
Merge tag 'xfs-for-linus-4.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs

Pull xfs updates from Dave Chinner:
 "There is quite a varied bunch of stuff in this update, and some of it
  you will have already merged through the ext4 tree which imported the
  dax-4.10-iomap-pmd topic branch from the XFS tree.

  There is also a new direct IO implementation that uses the iomap
  infrastructure. It's much simpler, faster, and has lower IO latency
  than the existing direct IO infrastructure.

  Summary:
   - DAX PMD faults via iomap infrastructure
   - Direct-io support in iomap infrastructure
   - removal of now-redundant XFS inode iolock, replaced with VFS
     i_rwsem
   - synchronisation with fixes and changes in userspace libxfs code
   - extent tree lookup helpers
   - lots of little corruption detection improvements to verifiers
   - optimised CRC calculations
   - faster buffer cache lookups
   - deprecation of barrier/nobarrier mount options - we always use
     REQ_FUA/REQ_FLUSH where appropriate for data integrity now
   - cleanups to speculative preallocation
   - miscellaneous minor bug fixes and cleanups"

* tag 'xfs-for-linus-4.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (63 commits)
  xfs: nuke unused tracepoint definitions
  xfs: use GPF_NOFS when allocating btree cursors
  xfs: use xfs_vn_setattr_size to check on new size
  xfs: deprecate barrier/nobarrier mount option
  xfs: Always flush caches when integrity is required
  xfs: ignore leaf attr ichdr.count in verifier during log replay
  xfs: use rhashtable to track buffer cache
  xfs: optimise CRC updates
  xfs: make xfs btree stats less huge
  xfs: don't cap maximum dedupe request length
  xfs: don't allow di_size with high bit set
  xfs: error out if trying to add attrs and anextents > 0
  xfs: don't crash if reading a directory results in an unexpected hole
  xfs: complain if we don't get nextents bmap records
  xfs: check for bogus values in btree block headers
  xfs: forbid AG btrees with level == 0
  xfs: several xattr functions can be void
  xfs: handle cow fork in xfs_bmap_trace_exlist
  xfs: pass state not whichfork to trace_xfs_extlist
  xfs: Move AGI buffer type setting to xfs_read_agi
  ...
Linus Torvalds, 8 years ago
parent commit 5cc60aeedf
67 files changed, 1357 insertions, 1464 deletions
  1. Documentation/filesystems/xfs.txt (+4 -8)
  2. fs/direct-io.c (+1 -1)
  3. fs/internal.h (+3 -0)
  4. fs/iomap.c (+373 -0)
  5. fs/xfs/libxfs/xfs_alloc.c (+7 -3)
  6. fs/xfs/libxfs/xfs_alloc_btree.c (+5 -1)
  7. fs/xfs/libxfs/xfs_attr_leaf.c (+7 -1)
  8. fs/xfs/libxfs/xfs_attr_leaf.h (+2 -2)
  9. fs/xfs/libxfs/xfs_bmap.c (+135 -201)
 10. fs/xfs/libxfs/xfs_bmap.h (+2 -7)
 11. fs/xfs/libxfs/xfs_bmap_btree.c (+2 -1)
 12. fs/xfs/libxfs/xfs_btree.c (+20 -0)
 13. fs/xfs/libxfs/xfs_btree.h (+4 -39)
 14. fs/xfs/libxfs/xfs_cksum.h (+22 -4)
 15. fs/xfs/libxfs/xfs_dir2.c (+1 -1)
 16. fs/xfs/libxfs/xfs_dir2.h (+5 -0)
 17. fs/xfs/libxfs/xfs_dir2_data.c (+18 -8)
 18. fs/xfs/libxfs/xfs_dir2_priv.h (+0 -1)
 19. fs/xfs/libxfs/xfs_ialloc.c (+13 -5)
 20. fs/xfs/libxfs/xfs_ialloc_btree.c (+3 -1)
 21. fs/xfs/libxfs/xfs_inode_buf.c (+12 -4)
 22. fs/xfs/libxfs/xfs_inode_buf.h (+2 -2)
 23. fs/xfs/libxfs/xfs_inode_fork.c (+65 -12)
 24. fs/xfs/libxfs/xfs_inode_fork.h (+7 -0)
 25. fs/xfs/libxfs/xfs_log_format.h (+2 -2)
 26. fs/xfs/libxfs/xfs_log_recover.h (+1 -1)
 27. fs/xfs/libxfs/xfs_refcount_btree.c (+1 -0)
 28. fs/xfs/libxfs/xfs_rmap_btree.c (+1 -0)
 29. fs/xfs/libxfs/xfs_rtbitmap.c (+0 -1)
 30. fs/xfs/libxfs/xfs_sb.c (+11 -2)
 31. fs/xfs/libxfs/xfs_types.h (+3 -1)
 32. fs/xfs/xfs_aops.c (+13 -273)
 33. fs/xfs/xfs_aops.h (+0 -6)
 34. fs/xfs/xfs_attr.h (+2 -2)
 35. fs/xfs/xfs_attr_list.c (+22 -37)
 36. fs/xfs/xfs_bmap_util.c (+20 -25)
 37. fs/xfs/xfs_buf.c (+74 -49)
 38. fs/xfs/xfs_buf.h (+2 -1)
 39. fs/xfs/xfs_dir2_readdir.c (+0 -2)
 40. fs/xfs/xfs_file.c (+94 -170)
 41. fs/xfs/xfs_icache.c (+22 -18)
 42. fs/xfs/xfs_icreate_item.c (+1 -1)
 43. fs/xfs/xfs_inode.c (+33 -51)
 44. fs/xfs/xfs_inode.h (+8 -10)
 45. fs/xfs/xfs_inode_item.c (+2 -2)
 46. fs/xfs/xfs_ioctl.c (+3 -5)
 47. fs/xfs/xfs_iomap.c (+64 -40)
 48. fs/xfs/xfs_iops.c (+6 -8)
 49. fs/xfs/xfs_linux.h (+1 -0)
 50. fs/xfs/xfs_log.c (+17 -24)
 51. fs/xfs/xfs_log_recover.c (+9 -7)
 52. fs/xfs/xfs_mount.c (+5 -2)
 53. fs/xfs/xfs_mount.h (+5 -2)
 54. fs/xfs/xfs_pnfs.c (+1 -6)
 55. fs/xfs/xfs_pnfs.h (+2 -2)
 56. fs/xfs/xfs_qm.c (+1 -1)
 57. fs/xfs/xfs_reflink.c (+54 -137)
 58. fs/xfs/xfs_reflink.h (+2 -4)
 59. fs/xfs/xfs_stats.c (+5 -5)
 60. fs/xfs/xfs_stats.h (+81 -119)
 61. fs/xfs/xfs_super.c (+17 -10)
 62. fs/xfs/xfs_symlink.c (+3 -4)
 63. fs/xfs/xfs_trace.h (+0 -109)
 64. fs/xfs/xfs_xattr.c (+10 -13)
 65. include/linux/iomap.h (+11 -0)
 66. include/linux/lockdep.h (+23 -2)
 67. kernel/locking/lockdep.c (+12 -8)

+ 4 - 8
Documentation/filesystems/xfs.txt

@@ -51,13 +51,6 @@ default behaviour.
 	CRC enabled filesystems always use the attr2 format, and so
 	will reject the noattr2 mount option if it is set.
 
-  barrier (*)
-  nobarrier
-	Enables/disables the use of block layer write barriers for
-	writes into the journal and for data integrity operations.
-	This allows for drive level write caching to be enabled, for
-	devices that support write barriers.
-
   discard
   nodiscard (*)
 	Enable/disable the issuing of commands to let the block
@@ -228,7 +221,10 @@ default behaviour.
 Deprecated Mount Options
 ========================
 
-None at present.
+  Name				Removal Schedule
+  ----				----------------
+  barrier			no earlier than v4.15
+  nobarrier			no earlier than v4.15
 
 
 Removed Mount Options
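
The practical effect of this hunk: barrier/nobarrier are still accepted at mount time, so existing fstabs keep working, but the options no longer control anything. A hedged sketch of the option handling this implies; the real change lives in fs/xfs/xfs_super.c (file 61 in the list above, not shown in this excerpt), and the exact warning text is an assumption:

	/* Hypothetical sketch of the xfs_super.c side of this deprecation. */
	case Opt_barrier:
		xfs_warn(mp, "%s option is deprecated, ignoring.", p);
		mp->m_flags |= XFS_MOUNT_BARRIER;
		break;
	case Opt_nobarrier:
		xfs_warn(mp, "%s option is deprecated, ignoring.", p);
		mp->m_flags &= ~XFS_MOUNT_BARRIER;
		break;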

+ 1 - 1
fs/direct-io.c

@@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
  * filesystems that don't need it and also allows us to create the workqueue
  * late enough so the we can include s_id in the name of the workqueue.
  */
-static int sb_init_dio_done_wq(struct super_block *sb)
+int sb_init_dio_done_wq(struct super_block *sb)
 {
 	struct workqueue_struct *old;
 	struct workqueue_struct *wq = alloc_workqueue("dio/%s",

+ 3 - 0
fs/internal.h

@@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
 loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
 		unsigned flags, struct iomap_ops *ops, void *data,
 		iomap_actor_t actor);
+
+/* direct-io.c: */
+int sb_init_dio_done_wq(struct super_block *sb);

+ 373 - 0
fs/iomap.c

@@ -24,6 +24,7 @@
 #include <linux/uio.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/dax.h>
 #include "internal.h"
 
@@ -584,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iomap_fiemap);
+
+/*
+ * Private flags for iomap_dio, must not overlap with the public ones in
+ * iomap.h:
+ */
+#define IOMAP_DIO_WRITE		(1 << 30)
+#define IOMAP_DIO_DIRTY		(1 << 31)
+
+struct iomap_dio {
+	struct kiocb		*iocb;
+	iomap_dio_end_io_t	*end_io;
+	loff_t			i_size;
+	loff_t			size;
+	atomic_t		ref;
+	unsigned		flags;
+	int			error;
+
+	union {
+		/* used during submission and for synchronous completion: */
+		struct {
+			struct iov_iter		*iter;
+			struct task_struct	*waiter;
+			struct request_queue	*last_queue;
+			blk_qc_t		cookie;
+		} submit;
+
+		/* used for aio completion: */
+		struct {
+			struct work_struct	work;
+		} aio;
+	};
+};
+
+static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+{
+	struct kiocb *iocb = dio->iocb;
+	ssize_t ret;
+
+	if (dio->end_io) {
+		ret = dio->end_io(iocb,
+				dio->error ? dio->error : dio->size,
+				dio->flags);
+	} else {
+		ret = dio->error;
+	}
+
+	if (likely(!ret)) {
+		ret = dio->size;
+		/* check for short read */
+		if (iocb->ki_pos + ret > dio->i_size &&
+		    !(dio->flags & IOMAP_DIO_WRITE))
+			ret = dio->i_size - iocb->ki_pos;
+		iocb->ki_pos += ret;
+	}
+
+	inode_dio_end(file_inode(iocb->ki_filp));
+	kfree(dio);
+
+	return ret;
+}
+
+static void iomap_dio_complete_work(struct work_struct *work)
+{
+	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
+	struct kiocb *iocb = dio->iocb;
+	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
+	ssize_t ret;
+
+	ret = iomap_dio_complete(dio);
+	if (is_write && ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	iocb->ki_complete(iocb, ret, 0);
+}
+
+/*
+ * Set an error in the dio if none is set yet.  We have to use cmpxchg
+ * as the submission context and the completion context(s) can race to
+ * update the error.
+ */
+static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
+{
+	cmpxchg(&dio->error, 0, ret);
+}
+
+static void iomap_dio_bio_end_io(struct bio *bio)
+{
+	struct iomap_dio *dio = bio->bi_private;
+	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+	if (bio->bi_error)
+		iomap_dio_set_error(dio, bio->bi_error);
+
+	if (atomic_dec_and_test(&dio->ref)) {
+		if (is_sync_kiocb(dio->iocb)) {
+			struct task_struct *waiter = dio->submit.waiter;
+
+			WRITE_ONCE(dio->submit.waiter, NULL);
+			wake_up_process(waiter);
+		} else if (dio->flags & IOMAP_DIO_WRITE) {
+			struct inode *inode = file_inode(dio->iocb->ki_filp);
+
+			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+		} else {
+			iomap_dio_complete_work(&dio->aio.work);
+		}
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(bio);
+	} else {
+		struct bio_vec *bvec;
+		int i;
+
+		bio_for_each_segment_all(bvec, bio, i)
+			put_page(bvec->bv_page);
+		bio_put(bio);
+	}
+}
+
+static blk_qc_t
+iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+		unsigned len)
+{
+	struct page *page = ZERO_PAGE(0);
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	bio->bi_bdev = iomap->bdev;
+	bio->bi_iter.bi_sector =
+		iomap->blkno + ((pos - iomap->offset) >> 9);
+	bio->bi_private = dio;
+	bio->bi_end_io = iomap_dio_bio_end_io;
+
+	get_page(page);
+	if (bio_add_page(bio, page, len, 0) != len)
+		BUG();
+	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+
+	atomic_inc(&dio->ref);
+	return submit_bio(bio);
+}
+
+static loff_t
+iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct iomap_dio *dio = data;
+	unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+	unsigned fs_block_size = (1 << inode->i_blkbits), pad;
+	unsigned align = iov_iter_alignment(dio->submit.iter);
+	struct iov_iter iter;
+	struct bio *bio;
+	bool need_zeroout = false;
+	int nr_pages, ret;
+
+	if ((pos | length | align) & ((1 << blkbits) - 1))
+		return -EINVAL;
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
+			return -EIO;
+		/*FALLTHRU*/
+	case IOMAP_UNWRITTEN:
+		if (!(dio->flags & IOMAP_DIO_WRITE)) {
+			iov_iter_zero(length, dio->submit.iter);
+			dio->size += length;
+			return length;
+		}
+		dio->flags |= IOMAP_DIO_UNWRITTEN;
+		need_zeroout = true;
+		break;
+	case IOMAP_MAPPED:
+		if (iomap->flags & IOMAP_F_SHARED)
+			dio->flags |= IOMAP_DIO_COW;
+		if (iomap->flags & IOMAP_F_NEW)
+			need_zeroout = true;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+
+	/*
+	 * Operate on a partial iter trimmed to the extent we were called for.
+	 * We'll update the iter in the dio once we're done with this extent.
+	 */
+	iter = *dio->submit.iter;
+	iov_iter_truncate(&iter, length);
+
+	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+	if (nr_pages <= 0)
+		return nr_pages;
+
+	if (need_zeroout) {
+		/* zero out from the start of the block to the write offset */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos - pad, pad);
+	}
+
+	do {
+		if (dio->error)
+			return 0;
+
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		bio->bi_bdev = iomap->bdev;
+		bio->bi_iter.bi_sector =
+			iomap->blkno + ((pos - iomap->offset) >> 9);
+		bio->bi_private = dio;
+		bio->bi_end_io = iomap_dio_bio_end_io;
+
+		ret = bio_iov_iter_get_pages(bio, &iter);
+		if (unlikely(ret)) {
+			bio_put(bio);
+			return ret;
+		}
+
+		if (dio->flags & IOMAP_DIO_WRITE) {
+			bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+			task_io_account_write(bio->bi_iter.bi_size);
+		} else {
+			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+			if (dio->flags & IOMAP_DIO_DIRTY)
+				bio_set_pages_dirty(bio);
+		}
+
+		dio->size += bio->bi_iter.bi_size;
+		pos += bio->bi_iter.bi_size;
+
+		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+
+		atomic_inc(&dio->ref);
+
+		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+		dio->submit.cookie = submit_bio(bio);
+	} while (nr_pages);
+
+	if (need_zeroout) {
+		/* zero out from the end of the write to the end of the block */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
+	}
+
+	iov_iter_advance(dio->submit.iter, length);
+	return length;
+}
+
+ssize_t
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
+		iomap_dio_end_io_t end_io)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	size_t count = iov_iter_count(iter);
+	loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
+	unsigned int flags = IOMAP_DIRECT;
+	struct blk_plug plug;
+	struct iomap_dio *dio;
+
+	lockdep_assert_held(&inode->i_rwsem);
+
+	if (!count)
+		return 0;
+
+	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	if (!dio)
+		return -ENOMEM;
+
+	dio->iocb = iocb;
+	atomic_set(&dio->ref, 1);
+	dio->size = 0;
+	dio->i_size = i_size_read(inode);
+	dio->end_io = end_io;
+	dio->error = 0;
+	dio->flags = 0;
+
+	dio->submit.iter = iter;
+	if (is_sync_kiocb(iocb)) {
+		dio->submit.waiter = current;
+		dio->submit.cookie = BLK_QC_T_NONE;
+		dio->submit.last_queue = NULL;
+	}
+
+	if (iov_iter_rw(iter) == READ) {
+		if (pos >= dio->i_size)
+			goto out_free_dio;
+
+		if (iter->type == ITER_IOVEC)
+			dio->flags |= IOMAP_DIO_DIRTY;
+	} else {
+		dio->flags |= IOMAP_DIO_WRITE;
+		flags |= IOMAP_WRITE;
+	}
+
+	if (mapping->nrpages) {
+		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+		if (ret)
+			goto out_free_dio;
+
+		ret = invalidate_inode_pages2_range(mapping,
+				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
+	}
+
+	inode_dio_begin(inode);
+
+	blk_start_plug(&plug);
+	do {
+		ret = iomap_apply(inode, pos, count, flags, ops, dio,
+				iomap_dio_actor);
+		if (ret <= 0) {
+			/* magic error code to fall back to buffered I/O */
+			if (ret == -ENOTBLK)
+				ret = 0;
+			break;
+		}
+		pos += ret;
+	} while ((count = iov_iter_count(iter)) > 0);
+	blk_finish_plug(&plug);
+
+	if (ret < 0)
+		iomap_dio_set_error(dio, ret);
+
+	if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
+			!inode->i_sb->s_dio_done_wq) {
+		ret = sb_init_dio_done_wq(inode->i_sb);
+		if (ret < 0)
+			iomap_dio_set_error(dio, ret);
+	}
+
+	if (!atomic_dec_and_test(&dio->ref)) {
+		if (!is_sync_kiocb(iocb))
+			return -EIOCBQUEUED;
+
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!READ_ONCE(dio->submit.waiter))
+				break;
+
+			if (!(iocb->ki_flags & IOCB_HIPRI) ||
+			    !dio->submit.last_queue ||
+			    !blk_mq_poll(dio->submit.last_queue,
+					 dio->submit.cookie))
+				io_schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+	}
+
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 */
+	if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+		ret = invalidate_inode_pages2_range(mapping,
+				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+	}
+
+	return iomap_dio_complete(dio);
+
+out_free_dio:
+	kfree(dio);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dio_rw);

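For orientation: iomap_dio_rw() is the entry point a filesystem calls from its ->read_iter/->write_iter methods, and the lockdep_assert_held() above means the caller must already hold i_rwsem. A minimal caller sketch, modeled loosely on the xfs_file.c changes elsewhere in this pull (not shown in this excerpt); the function and ops names are placeholders, and passing a NULL end_io is allowed by the completion path above:

	static ssize_t
	myfs_file_dio_read(struct kiocb *iocb, struct iov_iter *to)
	{
		struct inode	*inode = file_inode(iocb->ki_filp);
		ssize_t		ret;

		inode_lock_shared(inode);	/* satisfies the i_rwsem assertion */
		ret = iomap_dio_rw(iocb, to, &myfs_iomap_ops, NULL);
		inode_unlock_shared(inode);

		return ret;
	}
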
+ 7 - 3
fs/xfs/libxfs/xfs_alloc.c

@@ -2455,12 +2455,15 @@ xfs_agf_verify(
 	      be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
 		return false;
 
-	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
+	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
+	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
+	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
 		return false;
 
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
-	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)
+	    (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
+	     be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
 		return false;
 
 	/*
@@ -2477,7 +2480,8 @@ xfs_agf_verify(
 		return false;
 
 	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
-	    be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)
+	    (be32_to_cpu(agf->agf_refcount_level) < 1 ||
+	     be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
 		return false;
 
 	return true;;

+ 5 - 1
fs/xfs/libxfs/xfs_alloc_btree.c

@@ -421,13 +421,17 @@ xfs_allocbt_init_cursor(
 
 	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
 
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
 
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
 	cur->bc_btnum = btnum;
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
 	cur->bc_ops = &xfs_allocbt_ops;
+	if (btnum == XFS_BTNUM_BNO)
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+	else
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
 
 	if (btnum == XFS_BTNUM_CNT) {
 		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);

+ 7 - 1
fs/xfs/libxfs/xfs_attr_leaf.c

@@ -253,6 +253,7 @@ xfs_attr3_leaf_verify(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	struct xfs_attr_leafblock *leaf = bp->b_addr;
+	struct xfs_perag *pag = bp->b_pag;
 	struct xfs_attr3_icleaf_hdr ichdr;
 
 	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
@@ -273,7 +274,12 @@ xfs_attr3_leaf_verify(
 		if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
 			return false;
 	}
-	if (ichdr.count == 0)
+	/*
+	 * In recovery there is a transient state where count == 0 is valid
+	 * because we may have transitioned an empty shortform attr to a leaf
+	 * if the attr didn't fit in shortform.
+	 */
+	if (pag && pag->pagf_init && ichdr.count == 0)
 		return false;
 
 	/* XXX: need to range check rest of attr header values */

+ 2 - 2
fs/xfs/libxfs/xfs_attr_leaf.h

@@ -51,7 +51,7 @@ int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
 int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
 int	xfs_attr_shortform_remove(struct xfs_da_args *args);
 int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
-int	xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
+int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
 void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
 
 /*
@@ -77,7 +77,7 @@ int	xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
 				 struct xfs_da_args *args);
 int	xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
 				    struct xfs_da_args *args);
-int	xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+void	xfs_attr3_leaf_list_int(struct xfs_buf *bp,
 				      struct xfs_attr_list_context *context);
 
 /*

+ 135 - 201
fs/xfs/libxfs/xfs_bmap.c

@@ -49,6 +49,8 @@
 #include "xfs_rmap.h"
 #include "xfs_ag_resv.h"
 #include "xfs_refcount.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_icache.h"
 
 
 kmem_zone_t		*xfs_bmap_free_item_zone;
@@ -190,8 +192,12 @@ xfs_bmap_worst_indlen(
 	int		maxrecs;	/* maximum record count at this level */
 	xfs_mount_t	*mp;		/* mount structure */
 	xfs_filblks_t	rval;		/* return value */
+	xfs_filblks_t   orig_len;
 
 	mp = ip->i_mount;
+
+	/* Calculate the worst-case size of the bmbt. */
+	orig_len = len;
 	maxrecs = mp->m_bmap_dmxr[0];
 	for (level = 0, rval = 0;
 	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
@@ -199,12 +205,20 @@ xfs_bmap_worst_indlen(
 		len += maxrecs - 1;
 		do_div(len, maxrecs);
 		rval += len;
-		if (len == 1)
-			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+		if (len == 1) {
+			rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
 				level - 1;
+			break;
+		}
 		if (level == 0)
 			maxrecs = mp->m_bmap_dmxr[1];
 	}
+
+	/* Calculate the worst-case size of the rmapbt. */
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+		rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) +
+				mp->m_rmap_maxlevels;
+
 	return rval;
 }
 
@@ -504,7 +518,7 @@ void
 xfs_bmap_trace_exlist(
 	xfs_inode_t	*ip,		/* incore inode pointer */
 	xfs_extnum_t	cnt,		/* count of entries in the list */
-	int		whichfork,	/* data or attr fork */
+	int		whichfork,	/* data or attr or cow fork */
 	unsigned long	caller_ip)
 {
 	xfs_extnum_t	idx;		/* extent record index */
@@ -513,11 +527,13 @@ xfs_bmap_trace_exlist(
 
 	if (whichfork == XFS_ATTR_FORK)
 		state |= BMAP_ATTRFORK;
+	else if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(cnt == xfs_iext_count(ifp));
 	for (idx = 0; idx < cnt; idx++)
-		trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+		trace_xfs_extlist(ip, idx, state, caller_ip);
 }
 
 /*
@@ -811,7 +827,7 @@ try_another_ag:
 				XFS_BTREE_LONG_PTRS);
 
 	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents =  xfs_iext_count(ifp);
 	for (cnt = i = 0; i < nextents; i++) {
 		ep = xfs_iext_get_ext(ifp, i);
 		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
@@ -1137,6 +1153,10 @@ xfs_bmap_add_attrfork(
 		goto trans_cancel;
 	if (XFS_IFORK_Q(ip))
 		goto trans_cancel;
+	if (ip->i_d.di_anextents != 0) {
+		error = -EFSCORRUPTED;
+		goto trans_cancel;
+	}
 	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
 		/*
 		 * For inodes coming from pre-6.2 filesystems.
@@ -1144,7 +1164,6 @@ xfs_bmap_add_attrfork(
 		ASSERT(ip->i_d.di_aformat == 0);
 		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
 	}
-	ASSERT(ip->i_d.di_anextents == 0);
 
 	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1296,7 +1315,7 @@ xfs_bmap_read_extents(
 	/*
 	 * Here with bp and block set to the leftmost leaf node in the tree.
 	 */
-	room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	room = xfs_iext_count(ifp);
 	i = 0;
 	/*
 	 * Loop over all leaf nodes.  Copy information to the extent records.
@@ -1361,8 +1380,9 @@ xfs_bmap_read_extents(
 			return error;
 		block = XFS_BUF_TO_BLOCK(bp);
 	}
-	ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
-	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+	if (i != XFS_IFORK_NEXTENTS(ip, whichfork))
+		return -EFSCORRUPTED;
+	ASSERT(i == xfs_iext_count(ifp));
 	XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
 	return 0;
 error0:
@@ -1370,97 +1390,6 @@ error0:
 	return -EFSCORRUPTED;
 }
 
-
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry.  If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none).  Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
-xfs_bmap_search_multi_extents(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_fileoff_t	bno,		/* block number searched for */
-	int		*eofp,		/* out: end of file found */
-	xfs_extnum_t	*lastxp,	/* out: last extent index */
-	xfs_bmbt_irec_t	*gotp,		/* out: extent entry found */
-	xfs_bmbt_irec_t	*prevp)		/* out: previous extent entry found */
-{
-	xfs_bmbt_rec_host_t *ep;		/* extent record pointer */
-	xfs_extnum_t	lastx;		/* last extent index */
-
-	/*
-	 * Initialize the extent entry structure to catch access to
-	 * uninitialized br_startblock field.
-	 */
-	gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
-	gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
-	gotp->br_state = XFS_EXT_INVALID;
-	gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
-	prevp->br_startoff = NULLFILEOFF;
-
-	ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
-	if (lastx > 0) {
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
-	}
-	if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
-		xfs_bmbt_get_all(ep, gotp);
-		*eofp = 0;
-	} else {
-		if (lastx > 0) {
-			*gotp = *prevp;
-		}
-		*eofp = 1;
-		ep = NULL;
-	}
-	*lastxp = lastx;
-	return ep;
-}
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry.  If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
-xfs_bmap_search_extents(
-	xfs_inode_t     *ip,            /* incore inode pointer */
-	xfs_fileoff_t   bno,            /* block number searched for */
-	int             fork,      	/* data or attr fork */
-	int             *eofp,          /* out: end of file found */
-	xfs_extnum_t    *lastxp,        /* out: last extent index */
-	xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
-	xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
-{
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	xfs_bmbt_rec_host_t  *ep;            /* extent record pointer */
-
-	XFS_STATS_INC(ip->i_mount, xs_look_exlist);
-	ifp = XFS_IFORK_PTR(ip, fork);
-
-	ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
-
-	if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
-		     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
-		xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
-				"Access to block zero in inode %llu "
-				"start_block: %llx start_off: %llx "
-				"blkcnt: %llx extent-state: %x lastx: %x",
-			(unsigned long long)ip->i_ino,
-			(unsigned long long)gotp->br_startblock,
-			(unsigned long long)gotp->br_startoff,
-			(unsigned long long)gotp->br_blockcount,
-			gotp->br_state, *lastxp);
-		*lastxp = NULLEXTNUM;
-		*eofp = 1;
-		return NULL;
-	}
-	return ep;
-}
-
 /*
  * Returns the file-relative block number of the first unused block(s)
  * in the file with at least "len" logically contiguous blocks free.
@@ -1497,7 +1426,7 @@ xfs_bmap_first_unused(
 	    (error = xfs_iread_extents(tp, ip, whichfork)))
 		return error;
 	lowest = *first_unused;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 	for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
 		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
 		off = xfs_bmbt_get_startoff(ep);
@@ -1523,44 +1452,44 @@ xfs_bmap_first_unused(
  */
 int						/* error */
 xfs_bmap_last_before(
-	xfs_trans_t	*tp,			/* transaction pointer */
-	xfs_inode_t	*ip,			/* incore inode */
-	xfs_fileoff_t	*last_block,		/* last block */
-	int		whichfork)		/* data or attr fork */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		*last_block,	/* last block */
+	int			whichfork)	/* data or attr fork */
 {
-	xfs_fileoff_t	bno;			/* input file offset */
-	int		eof;			/* hit end of file */
-	xfs_bmbt_rec_host_t *ep;		/* pointer to last extent */
-	int		error;			/* error return value */
-	xfs_bmbt_irec_t	got;			/* current extent value */
-	xfs_ifork_t	*ifp;			/* inode fork pointer */
-	xfs_extnum_t	lastx;			/* last extent used */
-	xfs_bmbt_irec_t	prev;			/* previous extent value */
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_bmbt_irec	got;
+	xfs_extnum_t		idx;
+	int			error;
 
-	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
-	       return -EIO;
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+	case XFS_DINODE_FMT_LOCAL:
 		*last_block = 0;
 		return 0;
+	case XFS_DINODE_FMT_BTREE:
+	case XFS_DINODE_FMT_EXTENTS:
+		break;
+	default:
+		return -EIO;
 	}
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(tp, ip, whichfork)))
-		return error;
-	bno = *last_block - 1;
-	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
-		&prev);
-	if (eof || xfs_bmbt_get_startoff(ep) > bno) {
-		if (prev.br_startoff == NULLFILEOFF)
-			*last_block = 0;
-		else
-			*last_block = prev.br_startoff + prev.br_blockcount;
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
 	}
-	/*
-	 * Otherwise *last_block is already the right answer.
-	 */
+
+	if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) {
+		if (got.br_startoff <= *last_block - 1)
+			return 0;
+	}
+
+	if (xfs_iext_get_extent(ifp, idx - 1, &got)) {
+		*last_block = got.br_startoff + got.br_blockcount;
+		return 0;
+	}
+
+	*last_block = 0;
 	return 0;
 }
 
@@ -1582,7 +1511,7 @@ xfs_bmap_last_extent(
 			return error;
 	}
 
-	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 	if (nextents == 0) {
 		*is_empty = 1;
 		return 0;
@@ -1735,7 +1664,7 @@ xfs_bmap_add_extent_delay_real(
 						&bma->ip->i_d.di_nextents);
 
 	ASSERT(bma->idx >= 0);
-	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(bma->idx <= xfs_iext_count(ifp));
 	ASSERT(!isnullstartblock(new->br_startblock));
 	ASSERT(!bma->cur ||
 	       (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
@@ -1794,7 +1723,7 @@ xfs_bmap_add_extent_delay_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+	if (bma->idx < xfs_iext_count(ifp) - 1) {
 		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
 
@@ -2300,7 +2229,7 @@ xfs_bmap_add_extent_unwritten_real(
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 
 	ASSERT(*idx >= 0);
-	ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(*idx <= xfs_iext_count(ifp));
 	ASSERT(!isnullstartblock(new->br_startblock));
 
 	XFS_STATS_INC(mp, xs_add_exlist);
@@ -2356,7 +2285,7 @@ xfs_bmap_add_extent_unwritten_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+	if (*idx < xfs_iext_count(&ip->i_df) - 1) {
 		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
 		if (isnullstartblock(RIGHT.br_startblock))
@@ -2836,7 +2765,7 @@ xfs_bmap_add_extent_hole_delay(
 	 * Check and set flags if the current (right) segment exists.
 	 * If it doesn't exist, we're converting the hole at end-of-file.
 	 */
-	if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (*idx < xfs_iext_count(ifp)) {
 		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
 
@@ -2966,7 +2895,7 @@ xfs_bmap_add_extent_hole_real(
 	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 
 	ASSERT(bma->idx >= 0);
-	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(bma->idx <= xfs_iext_count(ifp));
 	ASSERT(!isnullstartblock(new->br_startblock));
 	ASSERT(!bma->cur ||
 	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
@@ -2992,7 +2921,7 @@ xfs_bmap_add_extent_hole_real(
 	 * Check and set flags if this segment has a current value.
 	 * Not true if we're inserting into the "hole" at eof.
 	 */
-	if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (bma->idx < xfs_iext_count(ifp)) {
 		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
 		if (isnullstartblock(right.br_startblock))
@@ -4145,12 +4074,11 @@ xfs_bmapi_read(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_ifork	*ifp;
 	struct xfs_bmbt_irec	got;
-	struct xfs_bmbt_irec	prev;
 	xfs_fileoff_t		obno;
 	xfs_fileoff_t		end;
-	xfs_extnum_t		lastx;
+	xfs_extnum_t		idx;
 	int			error;
-	int			eof;
+	bool			eof = false;
 	int			n = 0;
 	int			whichfork = xfs_bmapi_whichfork(flags);
 
@@ -4190,7 +4118,8 @@ xfs_bmapi_read(
 			return error;
 	}
 
-	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
+	if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
+		eof = true;
 	end = bno + len;
 	obno = bno;
 
@@ -4221,10 +4150,8 @@ xfs_bmapi_read(
 			break;
 
 		/* Else go on to the next record. */
-		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
-		else
-			eof = 1;
+		if (!xfs_iext_get_extent(ifp, ++idx, &got))
+			eof = true;
 	}
 	*nmap = n;
 	return 0;
@@ -4234,10 +4161,10 @@ int
 xfs_bmapi_reserve_delalloc(
 	struct xfs_inode	*ip,
 	int			whichfork,
-	xfs_fileoff_t		aoff,
+	xfs_fileoff_t		off,
 	xfs_filblks_t		len,
+	xfs_filblks_t		prealloc,
 	struct xfs_bmbt_irec	*got,
-	struct xfs_bmbt_irec	*prev,
 	xfs_extnum_t		*lastx,
 	int			eof)
 {
@@ -4248,10 +4175,17 @@ xfs_bmapi_reserve_delalloc(
 	char			rt = XFS_IS_REALTIME_INODE(ip);
 	xfs_extlen_t		extsz;
 	int			error;
+	xfs_fileoff_t		aoff = off;
 
-	alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+	/*
+	 * Cap the alloc length. Keep track of prealloc so we know whether to
+	 * tag the inode before we return.
+	 */
+	alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
 	if (!eof)
 		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+	if (prealloc && alen >= len)
+		prealloc = alen - len;
 
 	/* Figure out the extent size, adjust alen */
 	if (whichfork == XFS_COW_FORK)
@@ -4259,7 +4193,12 @@ xfs_bmapi_reserve_delalloc(
 	else
 		extsz = xfs_get_extsz_hint(ip);
 	if (extsz) {
-		error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+		struct xfs_bmbt_irec	prev;
+
+		if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev))
+			prev.br_startoff = NULLFILEOFF;
+
+		error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof,
 					       1, 0, &aoff, &alen);
 		ASSERT(!error);
 	}
@@ -4312,6 +4251,16 @@ xfs_bmapi_reserve_delalloc(
 	 */
 	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
 
+	/*
+	 * Tag the inode if blocks were preallocated. Note that COW fork
+	 * preallocation can occur at the start or end of the extent, even when
+	 * prealloc == 0, so we must also check the aligned offset and length.
+	 */
+	if (whichfork == XFS_DATA_FORK && prealloc)
+		xfs_inode_set_eofblocks_tag(ip);
+	if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+		xfs_inode_set_cowblocks_tag(ip);
+
 	ASSERT(got->br_startoff <= aoff);
 	ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
 	ASSERT(isnullstartblock(got->br_startblock));
@@ -4349,7 +4298,7 @@ xfs_bmapi_allocate(
 	if (bma->wasdel) {
 		bma->length = (xfs_extlen_t)bma->got.br_blockcount;
 		bma->offset = bma->got.br_startoff;
-		if (bma->idx != NULLEXTNUM && bma->idx) {
+		if (bma->idx) {
 			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
 					 &bma->prev);
 		}
@@ -4563,7 +4512,7 @@ xfs_bmapi_write(
 	struct xfs_ifork	*ifp;
 	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_alloc */
 	xfs_fileoff_t		end;		/* end of mapped file region */
-	int			eof;		/* after the end of extents */
+	bool			eof = false;	/* after the end of extents */
 	int			error;		/* error return */
 	int			n;		/* current extent index */
 	xfs_fileoff_t		obno;		/* old block number (offset) */
@@ -4641,12 +4590,14 @@ xfs_bmapi_write(
 			goto error0;
 	}
 
-	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
-				&bma.prev);
 	n = 0;
 	end = bno + len;
 	obno = bno;
 
+	if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got))
+		eof = true;
+	if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev))
+		bma.prev.br_startoff = NULLFILEOFF;
 	bma.tp = tp;
 	bma.ip = ip;
 	bma.total = total;
@@ -4733,11 +4684,8 @@ xfs_bmapi_write(
 
 		/* Else go on to the next record. */
 		bma.prev = bma.got;
-		if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
-					 &bma.got);
-		} else
-			eof = 1;
+		if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got))
+			eof = true;
 	}
 	*nmap = n;
 
@@ -4885,7 +4833,7 @@ xfs_bmap_del_extent_delay(
 	da_new = 0;
 
 	ASSERT(*idx >= 0);
-	ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(*idx <= xfs_iext_count(ifp));
 	ASSERT(del->br_blockcount > 0);
 	ASSERT(got->br_startoff <= del->br_startoff);
 	ASSERT(got_endoff >= del_endoff);
@@ -4902,8 +4850,11 @@ xfs_bmap_del_extent_delay(
 	 * sb counters as we might have to borrow some blocks for the
 	 * indirect block accounting.
 	 */
-	xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0,
+	error = xfs_trans_reserve_quota_nblks(NULL, ip,
+			-((long)del->br_blockcount), 0,
 			isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		return error;
 	ip->i_delayed_blks -= del->br_blockcount;
 
 	if (whichfork == XFS_COW_FORK)
@@ -5013,7 +4964,7 @@ xfs_bmap_del_extent_cow(
 	got_endoff = got->br_startoff + got->br_blockcount;
 
 	ASSERT(*idx >= 0);
-	ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(*idx <= xfs_iext_count(ifp));
 	ASSERT(del->br_blockcount > 0);
 	ASSERT(got->br_startoff <= del->br_startoff);
 	ASSERT(got_endoff >= del_endoff);
@@ -5119,8 +5070,7 @@ xfs_bmap_del_extent(
 		state |= BMAP_COWFORK;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
-		(uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp)));
 	ASSERT(del->br_blockcount > 0);
 	ep = xfs_iext_get_ext(ifp, *idx);
 	xfs_bmbt_get_all(ep, &got);
@@ -5434,8 +5384,6 @@ __xfs_bunmapi(
 {
 	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
 	xfs_bmbt_irec_t		del;		/* extent being deleted */
-	int			eof;		/* is deleting at eof */
-	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
 	int			error;		/* error return value */
 	xfs_extnum_t		extno;		/* extent number in list */
 	xfs_bmbt_irec_t		got;		/* current extent record */
@@ -5445,8 +5393,6 @@ __xfs_bunmapi(
 	int			logflags;	/* transaction logging flags */
 	xfs_extlen_t		mod;		/* rt extent offset */
 	xfs_mount_t		*mp;		/* mount structure */
-	xfs_extnum_t		nextents;	/* number of file extents */
-	xfs_bmbt_irec_t		prev;		/* previous extent record */
 	xfs_fileoff_t		start;		/* first file offset deleted */
 	int			tmp_logflags;	/* partial logging flags */
 	int			wasdel;		/* was a delayed alloc extent */
@@ -5477,8 +5423,7 @@ __xfs_bunmapi(
 	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
 	    (error = xfs_iread_extents(tp, ip, whichfork)))
 		return error;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	if (nextents == 0) {
+	if (xfs_iext_count(ifp) == 0) {
 		*rlen = 0;
 		return 0;
 	}
@@ -5486,18 +5431,17 @@ __xfs_bunmapi(
 	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
 	start = bno;
 	bno = start + len - 1;
-	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
-		&prev);
 
 	/*
 	 * Check to see if the given block number is past the end of the
 	 * file, back up to the last block if so...
 	 */
-	if (eof) {
-		ep = xfs_iext_get_ext(ifp, --lastx);
-		xfs_bmbt_get_all(ep, &got);
+	if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) {
+		ASSERT(lastx > 0);
+		xfs_iext_get_extent(ifp, --lastx, &got);
 		bno = got.br_startoff + got.br_blockcount - 1;
 	}
+
 	logflags = 0;
 	if (ifp->if_flags & XFS_IFBROOT) {
 		ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
@@ -5528,8 +5472,7 @@ __xfs_bunmapi(
 		if (got.br_startoff > bno) {
 			if (--lastx < 0)
 				break;
-			ep = xfs_iext_get_ext(ifp, lastx);
-			xfs_bmbt_get_all(ep, &got);
+			xfs_iext_get_extent(ifp, lastx, &got);
 		}
 		/*
 		 * Is the last block of this extent before the range
@@ -5543,7 +5486,6 @@ __xfs_bunmapi(
 		 * Then deal with the (possibly delayed) allocated space
 		 * we found.
 		 */
-		ASSERT(ep != NULL);
 		del = got;
 		wasdel = isnullstartblock(del.br_startblock);
 		if (got.br_startoff < start) {
@@ -5624,15 +5566,12 @@ __xfs_bunmapi(
 				 */
 				ASSERT(bno >= del.br_blockcount);
 				bno -= del.br_blockcount;
-				if (got.br_startoff > bno) {
-					if (--lastx >= 0) {
-						ep = xfs_iext_get_ext(ifp,
-								      lastx);
-						xfs_bmbt_get_all(ep, &got);
-					}
-				}
+				if (got.br_startoff > bno && --lastx >= 0)
+					xfs_iext_get_extent(ifp, lastx, &got);
 				continue;
 			} else if (del.br_state == XFS_EXT_UNWRITTEN) {
+				struct xfs_bmbt_irec	prev;
+
 				/*
 				 * This one is already unwritten.
 				 * It must have a written left neighbor.
@@ -5640,8 +5579,7 @@ __xfs_bunmapi(
 				 * try again.
 				 */
 				ASSERT(lastx > 0);
-				xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
-						lastx - 1), &prev);
+				xfs_iext_get_extent(ifp, lastx - 1, &prev);
 				ASSERT(prev.br_state == XFS_EXT_NORM);
 				ASSERT(!isnullstartblock(prev.br_startblock));
 				ASSERT(del.br_startblock ==
@@ -5739,13 +5677,9 @@ nodelete:
 		 */
 		if (bno != (xfs_fileoff_t)-1 && bno >= start) {
 			if (lastx >= 0) {
-				ep = xfs_iext_get_ext(ifp, lastx);
-				if (xfs_bmbt_get_startoff(ep) > bno) {
-					if (--lastx >= 0)
-						ep = xfs_iext_get_ext(ifp,
-								      lastx);
-				}
-				xfs_bmbt_get_all(ep, &got);
+				xfs_iext_get_extent(ifp, lastx, &got);
+				if (got.br_startoff > bno && --lastx >= 0)
+					xfs_iext_get_extent(ifp, lastx, &got);
 			}
 			extno++;
 		}
@@ -5963,7 +5897,7 @@ xfs_bmse_shift_one(
 
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	total_extents = xfs_iext_count(ifp);
 
 	xfs_bmbt_get_all(gotp, &got);
 
@@ -6140,7 +6074,7 @@ xfs_bmap_shift_extents(
 	 * are collapsing out, so we cannot use the count of real extents here.
 	 * Instead we have to calculate it from the incore fork.
 	 */
-	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	total_extents = xfs_iext_count(ifp);
 	if (total_extents == 0) {
 		*done = 1;
 		goto del_cursor;
@@ -6200,7 +6134,7 @@ xfs_bmap_shift_extents(
 		 * count can change. Update the total and grade the next record.
 		 */
 		if (direction == SHIFT_LEFT) {
-			total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+			total_extents = xfs_iext_count(ifp);
 			stop_extent = total_extents;
 		}
 

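A pattern worth noting in the hunks above: every open-coded "ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)" becomes xfs_iext_count(ifp). The helper itself is introduced in fs/xfs/libxfs/xfs_inode_fork.c/.h (files 23 and 24 in the list, not shown in this excerpt); its presumed shape is simply the old expression behind a name:

	/* Presumed definition; the real one lands in xfs_inode_fork.c. */
	xfs_extnum_t
	xfs_iext_count(struct xfs_ifork *ifp)
	{
		return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	}
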
+ 2 - 7
fs/xfs/libxfs/xfs_bmap.h

@@ -237,14 +237,9 @@ int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_defer_ops *dfops, enum shift_direction direction,
 		int num_exts);
 int	xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
-struct xfs_bmbt_rec_host *
-	xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
-		int fork, int *eofp, xfs_extnum_t *lastxp,
-		struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
 int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
-		xfs_fileoff_t aoff, xfs_filblks_t len,
-		struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev,
-		xfs_extnum_t *lastx, int eof);
+		xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
+		struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof);
 
 enum xfs_bmap_intent_type {
 	XFS_BMAP_MAP = 1,

+ 2 - 1
fs/xfs/libxfs/xfs_bmap_btree.c

@@ -796,13 +796,14 @@ xfs_bmbt_init_cursor(
 	struct xfs_btree_cur	*cur;
 	ASSERT(whichfork != XFS_COW_FORK);
 
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
 
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
 	cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
 	cur->bc_btnum = XFS_BTNUM_BMAP;
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
 
 	cur->bc_ops = &xfs_bmbt_ops;
 	cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;

+ 20 - 0
fs/xfs/libxfs/xfs_btree.c

@@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block(
 	if (error)
 		return error;
 
+	/* Check the inode owner since the verifiers don't. */
+	if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
+	    (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
+	    be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
+			cur->bc_private.b.ip->i_ino)
+		goto out_bad;
+
+	/* Did we get the level we were looking for? */
+	if (be16_to_cpu((*blkp)->bb_level) != level)
+		goto out_bad;
+
+	/* Check that internal nodes have at least one record. */
+	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
+		goto out_bad;
+
 	xfs_btree_setbuf(cur, level, bp);
 	return 0;
+
+out_bad:
+	*blkp = NULL;
+	xfs_trans_brelse(cur->bc_tp, bp);
+	return -EFSCORRUPTED;
 }
 }
 
 
 /*
 /*

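For reference, the three new checks in xfs_btree_lookup_get_block() above read fields of the common on-disk btree block header; an abridged view of that structure from xfs_format.h (union members elided, exact layout hedged to this kernel version):

	struct xfs_btree_block {
		__be32		bb_magic;	/* magic number for block type */
		__be16		bb_level;	/* 0 is a leaf */
		__be16		bb_numrecs;	/* current # of data records */
		/* short/long form union follows: sibling pointers plus,
		 * on CRC filesystems, bb_owner, bb_crc and bb_lsn */
	};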
+ 4 - 39
fs/xfs/libxfs/xfs_btree.h

@@ -96,46 +96,10 @@ union xfs_btree_rec {
 /*
  * Generic stats interface
  */
-#define __XFS_BTREE_STATS_INC(mp, type, stat) \
-	XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat)
 #define XFS_BTREE_STATS_INC(cur, stat)	\
-do {    \
-	struct xfs_mount *__mp = cur->bc_mp; \
-	switch (cur->bc_btnum) {  \
-	case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \
-	case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \
-	case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
-	case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
-	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
-	case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
-	case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \
-	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;	\
-	}       \
-} while (0)
-
-#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \
-	XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val)
-#define XFS_BTREE_STATS_ADD(cur, stat, val)  \
-do {    \
-	struct xfs_mount *__mp = cur->bc_mp; \
-	switch (cur->bc_btnum) {  \
-	case XFS_BTNUM_BNO:	\
-		__XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \
-	case XFS_BTNUM_CNT:	\
-		__XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \
-	case XFS_BTNUM_BMAP:	\
-		__XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \
-	case XFS_BTNUM_INO:	\
-		__XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
-	case XFS_BTNUM_FINO:	\
-		__XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
-	case XFS_BTNUM_RMAP:	\
-		__XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
-	case XFS_BTNUM_REFC:	\
-		__XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \
-	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
-	}       \
-} while (0)
+	XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
+#define XFS_BTREE_STATS_ADD(cur, stat, val)	\
+	XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
 
 #define	XFS_BTREE_MAXLEVELS	9	/* max of all btrees */
 
@@ -253,6 +217,7 @@ typedef struct xfs_btree_cur
 	__uint8_t	bc_nlevels;	/* number of levels in the tree */
 	__uint8_t	bc_blocklog;	/* log2(blocksize) of btree blocks */
 	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
+	int		bc_statoff;	/* offset of btree stats array */
 	union {
 		struct {			/* needed for BNO, CNT, INO */
 			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */

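The one-line replacements work because each btree's counters occupy a contiguous run of __uint32_t slots in struct xfsstats, so a cursor only needs the base offset recorded in the new bc_statoff field. XFS_STATS_CALC_INDEX amounts to the following (sketch consistent with fs/xfs/xfs_stats.h):

	/* index of a member within the __uint32_t array view of xfsstats */
	#define XFS_STATS_CALC_INDEX(member)	\
		(offsetof(struct xfsstats, member) / (int)sizeof(__uint32_t))

with the __XBTS_* constants enumerating the per-btree counter slots (lookup, compare, insrec, ...).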
+ 22 - 4
fs/xfs/libxfs/xfs_cksum.h

@@ -6,10 +6,11 @@
 /*
  * Calculate the intermediate checksum for a buffer that has the CRC field
  * inside it.  The offset of the 32bit crc fields is passed as the
- * cksum_offset parameter.
+ * cksum_offset parameter. We do not modify the buffer during verification,
+ * hence we have to split the CRC calculation across the cksum_offset.
  */
 static inline __uint32_t
-xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset)
 {
 	__uint32_t zero = 0;
 	__uint32_t crc;
@@ -25,6 +26,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
 		      length - (cksum_offset + sizeof(__be32)));
 }
 
+/*
+ * Fast CRC method where the buffer is modified. Callers must have exclusive
+ * access to the buffer while the calculation takes place.
+ */
+static inline __uint32_t
+xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	/* zero the CRC field */
+	*(__le32 *)(buffer + cksum_offset) = 0;
+
+	/* single pass CRC calculation for the entire buffer */
+	return crc32c(XFS_CRC_SEED, buffer, length);
+}
+
 /*
  * Convert the intermediate checksum to the final ondisk format.
  *
@@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc)
 
 /*
  * Helper to generate the checksum for a buffer.
+ *
+ * This modifies the buffer temporarily - callers must have exclusive
+ * access to the buffer while the calculation takes place.
  */
 static inline void
 xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
 {
-	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+	__uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset);
 
 	*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
 }
@@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
 static inline int
 xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
 {
-	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+	__uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset);
 
 	return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
 }

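The split gives verifiers a read-only two-pass CRC while writers keep a faster single-pass one. A hedged usage sketch for a v5 metadata buffer (the AGI offset is illustrative; any CRC-bearing header works the same way):

	/* read path: buffer may be under shared access, must not be dirtied */
	if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
			      XFS_AGI_CRC_OFF))
		return false;			/* CRC mismatch: corruption */

	/* write path: exclusive access held, single-pass update is safe */
	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_AGI_CRC_OFF);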
+ 1 - 1
fs/xfs/libxfs/xfs_dir2.c

@@ -93,7 +93,7 @@ xfs_ascii_ci_compname(
 	return result;
 }
 
-static struct xfs_nameops xfs_ascii_ci_nameops = {
+static const struct xfs_nameops xfs_ascii_ci_nameops = {
 	.hashname	= xfs_ascii_ci_hashname,
 	.compname	= xfs_ascii_ci_compname,
 };

+ 5 - 0
fs/xfs/libxfs/xfs_dir2.h

@@ -157,6 +157,9 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
 extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
 				struct xfs_buf *bp);
 
+extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo,
+		const struct xfs_dir_ops *ops,
+		struct xfs_dir2_data_hdr *hdr, int *loghead);
 extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
 		struct xfs_dir2_data_hdr *hdr, int *loghead);
 extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
@@ -177,6 +180,8 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
 		struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
 		struct xfs_dir2_data_unused *dup);
 
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
 extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
 extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
 extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;

+ 18 - 8
fs/xfs/libxfs/xfs_dir2_data.c

@@ -329,7 +329,7 @@ xfs_dir3_data_read(
 
 	err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
 				XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
-	if (!err && tp)
+	if (!err && tp && *bpp)
 		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
 	return err;
 }
@@ -505,8 +505,9 @@ xfs_dir2_data_freeremove(
  * Given a data block, reconstruct its bestfree map.
  */
 void
-xfs_dir2_data_freescan(
-	struct xfs_inode	*dp,
+xfs_dir2_data_freescan_int(
+	struct xfs_da_geometry	*geo,
+	const struct xfs_dir_ops *ops,
 	struct xfs_dir2_data_hdr *hdr,
 	int			*loghead)
 {
@@ -516,7 +517,6 @@ xfs_dir2_data_freescan(
 	struct xfs_dir2_data_free *bf;
 	char			*endp;		/* end of block's data */
 	char			*p;		/* current entry pointer */
-	struct xfs_da_geometry	*geo = dp->i_mount->m_dir_geo;
 
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
 	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -526,13 +526,13 @@ xfs_dir2_data_freescan(
 	/*
 	 * Start by clearing the table.
 	 */
-	bf = dp->d_ops->data_bestfree_p(hdr);
+	bf = ops->data_bestfree_p(hdr);
 	memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
 	*loghead = 1;
 	/*
 	 * Set up pointers.
 	 */
-	p = (char *)dp->d_ops->data_entry_p(hdr);
+	p = (char *)ops->data_entry_p(hdr);
 	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
 	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
 		btp = xfs_dir2_block_tail_p(geo, hdr);
@@ -559,12 +559,22 @@ xfs_dir2_data_freescan(
 		else {
 			dep = (xfs_dir2_data_entry_t *)p;
 			ASSERT((char *)dep - (char *)hdr ==
-			       be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
-			p += dp->d_ops->data_entsize(dep->namelen);
+			       be16_to_cpu(*ops->data_entry_tag_p(dep)));
+			p += ops->data_entsize(dep->namelen);
 		}
 	}
 }
 
+void
+xfs_dir2_data_freescan(
+	struct xfs_inode	*dp,
+	struct xfs_dir2_data_hdr *hdr,
+	int			*loghead)
+{
+	return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops,
+			hdr, loghead);
+}
+
 /*
  * Initialize a data block at the given block number in the directory.
  * Give back the buffer for the created block.

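Passing the geometry and directory ops explicitly removes the struct xfs_inode dependency, so the scan can run in contexts that only have a mount handle (the shape userspace libxfs wants). A hedged caller sketch, assuming the ops can be fetched via xfs_dir_get_ops():

	/* sketch: rebuild a data block's bestfree map without an inode */
	const struct xfs_dir_ops	*ops = xfs_dir_get_ops(mp, NULL);
	int				loghead;

	xfs_dir2_data_freescan_int(mp->m_dir_geo, ops, hdr, &loghead);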
+ 0 - 1
fs/xfs/libxfs/xfs_dir2_priv.h

@@ -21,7 +21,6 @@
 struct dir_context;
 
 /* xfs_dir2.c */
-extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
 extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
 				xfs_dir2_db_t *dbp);
 extern int xfs_dir_cilookup_result(struct xfs_da_args *args,

+ 13 - 5
fs/xfs/libxfs/xfs_ialloc.c

@@ -2344,7 +2344,8 @@ xfs_imap(
 
 		imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
 		imap->im_len = XFS_FSB_TO_BB(mp, 1);
-		imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+		imap->im_boffset = (unsigned short)(offset <<
+							mp->m_sb.sb_inodelog);
 		return 0;
 	}
 
@@ -2372,7 +2373,7 @@ out_map:
 
 	imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
 	imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
-	imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+	imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
 
 	/*
 	 * If the inode number maps to a block outside the bounds
@@ -2450,8 +2451,6 @@ xfs_ialloc_log_agi(
 	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
 #endif
 
-	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
-
 	/*
 	 * Compute byte offsets for the first and last fields in the first
 	 * region and log the agi buffer. This only logs up through
@@ -2512,8 +2511,15 @@ xfs_agi_verify(
 	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
 		return false;
 
-	if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+	if (be32_to_cpu(agi->agi_level) < 1 ||
+	    be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+		return false;
+
+	if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+	    (be32_to_cpu(agi->agi_free_level) < 1 ||
+	     be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS))
 		return false;
+
 	/*
 	 * during growfs operations, the perag is not fully initialised,
 	 * so we can't use it for any useful checking. growfs ensures we can't
@@ -2592,6 +2598,8 @@ xfs_read_agi(
 			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
 	if (error)
 		return error;
+	if (tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF);
 
 	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
 	return 0;

+ 3 - 1
fs/xfs/libxfs/xfs_ialloc_btree.c

@@ -357,7 +357,7 @@ xfs_inobt_init_cursor(
 	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
 	struct xfs_btree_cur	*cur;
 
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
 
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
@@ -365,9 +365,11 @@ xfs_inobt_init_cursor(
 	if (btnum == XFS_BTNUM_INO) {
 		cur->bc_nlevels = be32_to_cpu(agi->agi_level);
 		cur->bc_ops = &xfs_inobt_ops;
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
 	} else {
 		cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
 		cur->bc_ops = &xfs_finobt_ops;
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
 	}
 
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;

+ 12 - 4
fs/xfs/libxfs/xfs_inode_buf.c

@@ -383,7 +383,7 @@ xfs_log_dinode_to_disk(
 static bool
 xfs_dinode_verify(
 	struct xfs_mount	*mp,
-	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
 	struct xfs_dinode	*dip)
 {
 	uint16_t		flags;
@@ -392,6 +392,14 @@ xfs_dinode_verify(
 	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
 		return false;
 
+	/* don't allow invalid i_size */
+	if (be64_to_cpu(dip->di_size) & (1ULL << 63))
+		return false;
+
+	/* No zero-length symlinks. */
+	if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0)
+		return false;
+
 	/* only version 3 or greater inodes are extensively verified here */
 	if (dip->di_version < 3)
 		return true;
@@ -401,7 +409,7 @@ xfs_dinode_verify(
 	if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
 			      XFS_DINODE_CRC_OFF))
 		return false;
-	if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+	if (be64_to_cpu(dip->di_ino) != ino)
 		return false;
 	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
 		return false;
@@ -436,7 +444,7 @@ xfs_dinode_calc_crc(
 		return;
 
 	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
-	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+	crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize,
 			      XFS_DINODE_CRC_OFF);
 	dip->di_crc = xfs_end_cksum(crc);
 }
@@ -493,7 +501,7 @@ xfs_iread(
 		return error;
 
 	/* even unallocated inodes are verified */
-	if (!xfs_dinode_verify(mp, ip, dip)) {
+	if (!xfs_dinode_verify(mp, ip->i_ino, dip)) {
 		xfs_alert(mp, "%s: validation failed for inode %lld failed",
 				__func__, ip->i_ino);
 

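The new di_size check exists because the VFS handles file sizes as signed loff_t values: an on-disk size with bit 63 set would go negative in-core, so the verifier now rejects it before the inode is ever used. For example, a corrupt di_size of 0x8000000000000001 fails the (1ULL << 63) mask test above, where the old verifier would have accepted it.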
+ 2 - 2
fs/xfs/libxfs/xfs_inode_buf.h

@@ -58,8 +58,8 @@ struct xfs_icdinode {
  */
 struct xfs_imap {
 	xfs_daddr_t	im_blkno;	/* starting BB of inode chunk */
-	ushort		im_len;		/* length in BBs of inode chunk */
-	ushort		im_boffset;	/* inode offset in block in bytes */
+	unsigned short	im_len;		/* length in BBs of inode chunk */
+	unsigned short	im_boffset;	/* inode offset in block in bytes */
 };
 
 int	xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,

+ 65 - 12
fs/xfs/libxfs/xfs_inode_fork.c

@@ -775,6 +775,13 @@ xfs_idestroy_fork(
 	}
 }
 
+/* Count number of incore extents based on if_bytes */
+xfs_extnum_t
+xfs_iext_count(struct xfs_ifork *ifp)
+{
+	return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+}
+
 /*
  * Convert in-core extents to on-disk form
  *
@@ -803,7 +810,7 @@ xfs_iextents_copy(
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
 	ASSERT(ifp->if_bytes > 0);
 
-	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nrecs = xfs_iext_count(ifp);
 	XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
 	ASSERT(nrecs > 0);
 
@@ -941,7 +948,7 @@ xfs_iext_get_ext(
 	xfs_extnum_t	idx)		/* index of target extent */
 {
 	ASSERT(idx >= 0);
-	ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+	ASSERT(idx < xfs_iext_count(ifp));
 
 	if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
 		return ifp->if_u1.if_ext_irec->er_extbuf;
@@ -1017,7 +1024,7 @@ xfs_iext_add(
 	int		new_size;	/* size of extents after adding */
 	xfs_extnum_t	nextents;	/* number of extents in file */
 
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 	ASSERT((idx >= 0) && (idx <= nextents));
 	byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
 	new_size = ifp->if_bytes + byte_diff;
@@ -1241,7 +1248,7 @@ xfs_iext_remove(
 	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
 
 	ASSERT(ext_diff > 0);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 	new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
 
 	if (new_size == 0) {
@@ -1270,7 +1277,7 @@ xfs_iext_remove_inline(
 
 	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
 	ASSERT(idx < XFS_INLINE_EXTS);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 	ASSERT(((nextents - ext_diff) > 0) &&
 		(nextents - ext_diff) < XFS_INLINE_EXTS);
 
@@ -1309,7 +1316,7 @@ xfs_iext_remove_direct(
 	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
 	new_size = ifp->if_bytes -
 		(ext_diff * sizeof(xfs_bmbt_rec_t));
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 
 	if (new_size == 0) {
 		xfs_iext_destroy(ifp);
@@ -1546,7 +1553,7 @@ xfs_iext_indirect_to_direct(
 	int		size;		/* size of file extents */
 
 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 	ASSERT(nextents <= XFS_LINEAR_EXTS);
 	size = nextents * sizeof(xfs_bmbt_rec_t);
 
@@ -1620,7 +1627,7 @@ xfs_iext_bno_to_ext(
 	xfs_extnum_t	nextents;	/* number of file extents */
 	xfs_fileoff_t	startoff = 0;	/* start offset of extent */
 
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 	if (nextents == 0) {
 		*idxp = 0;
 		return NULL;
@@ -1733,8 +1740,8 @@ xfs_iext_idx_to_irec(
 
 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
 	ASSERT(page_idx >= 0);
-	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+	ASSERT(page_idx <= xfs_iext_count(ifp));
+	ASSERT(page_idx < xfs_iext_count(ifp) || realloc);
 
 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
 	erp_idx = 0;
@@ -1782,7 +1789,7 @@ xfs_iext_irec_init(
 	xfs_extnum_t	nextents;	/* number of extents in file */
 
 	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 	ASSERT(nextents <= XFS_LINEAR_EXTS);
 
 	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
@@ -1906,7 +1913,7 @@ xfs_iext_irec_compact(
 
 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 
 	if (nextents == 0) {
 		xfs_iext_destroy(ifp);
@@ -1996,3 +2003,49 @@ xfs_ifork_init_cow(
 	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
 	ip->i_cnextents = 0;
 }
+
+/*
+ * Lookup the extent covering bno.
+ *
+ * If there is an extent covering bno return the extent index, and store the
+ * expanded extent structure in *gotp, and the extent index in *idx.
+ * If there is no extent covering bno, but there is an extent after it (e.g.
+ * it lies in a hole) return that extent in *gotp and its index in *idx
+ * instead.
+ * If bno is beyond the last extent return false, and return the index after
+ * the last valid index in *idxp.
+ */
+bool
+xfs_iext_lookup_extent(
+	struct xfs_inode	*ip,
+	struct xfs_ifork	*ifp,
+	xfs_fileoff_t		bno,
+	xfs_extnum_t		*idxp,
+	struct xfs_bmbt_irec	*gotp)
+{
+	struct xfs_bmbt_rec_host *ep;
+
+	XFS_STATS_INC(ip->i_mount, xs_look_exlist);
+
+	ep = xfs_iext_bno_to_ext(ifp, bno, idxp);
+	if (!ep)
+		return false;
+	xfs_bmbt_get_all(ep, gotp);
+	return true;
+}
+
+/*
+ * Return true if there is an extent at index idx, and return the expanded
+ * extent structure at idx in that case.  Else return false.
+ */
+bool
+xfs_iext_get_extent(
+	struct xfs_ifork	*ifp,
+	xfs_extnum_t		idx,
+	struct xfs_bmbt_irec	*gotp)
+{
+	if (idx < 0 || idx >= xfs_iext_count(ifp))
+		return false;
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp);
+	return true;
+}

+ 7 - 0
fs/xfs/libxfs/xfs_inode_fork.h

@@ -152,6 +152,7 @@ void		xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
 
 struct xfs_bmbt_rec_host *
 		xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
+xfs_extnum_t	xfs_iext_count(struct xfs_ifork *);
 void		xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
 				struct xfs_bmbt_irec *, int);
 void		xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
@@ -181,6 +182,12 @@ void		xfs_iext_irec_compact_pages(struct xfs_ifork *);
 void		xfs_iext_irec_compact_full(struct xfs_ifork *);
 void		xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
 
+bool		xfs_iext_lookup_extent(struct xfs_inode *ip,
+			struct xfs_ifork *ifp, xfs_fileoff_t bno,
+			xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp);
+bool		xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx,
+			struct xfs_bmbt_irec *gotp);
+
 extern struct kmem_zone	*xfs_ifork_zone;
 
 extern void xfs_ifork_init_cow(struct xfs_inode *ip);

+ 2 - 2
fs/xfs/libxfs/xfs_log_format.h

@@ -481,8 +481,8 @@ static inline uint xfs_log_dinode_size(int version)
 typedef struct xfs_buf_log_format {
 	unsigned short	blf_type;	/* buf log item type indicator */
 	unsigned short	blf_size;	/* size of this item */
-	ushort		blf_flags;	/* misc state */
-	ushort		blf_len;	/* number of blocks in this buf */
+	unsigned short	blf_flags;	/* misc state */
+	unsigned short	blf_len;	/* number of blocks in this buf */
 	__int64_t	blf_blkno;	/* starting blkno of this buf */
 	unsigned int	blf_map_size;	/* used size of data bitmap in words */
 	unsigned int	blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */

+ 1 - 1
fs/xfs/libxfs/xfs_log_recover.h

@@ -52,7 +52,7 @@ typedef struct xlog_recover {
 	struct list_head	r_itemq;	/* q for items */
 } xlog_recover_t;
 
-#define ITEM_TYPE(i)	(*(ushort *)(i)->ri_buf[0].i_addr)
+#define ITEM_TYPE(i)	(*(unsigned short *)(i)->ri_buf[0].i_addr)
 
 /*
  * This is the number of entries in the l_buf_cancel_table used during

+ 1 - 0
fs/xfs/libxfs/xfs_refcount_btree.c

@@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor(
 	cur->bc_btnum = XFS_BTNUM_REFC;
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
 	cur->bc_ops = &xfs_refcountbt_ops;
+	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
 
 	cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
 

+ 1 - 0
fs/xfs/libxfs/xfs_rmap_btree.c

@@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor(
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
 	cur->bc_ops = &xfs_rmapbt_ops;
 	cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
 
 	cur->bc_private.a.agbp = agbp;
 	cur->bc_private.a.agno = agno;

+ 0 - 1
fs/xfs/libxfs/xfs_rtbitmap.c

@@ -1016,4 +1016,3 @@ xfs_rtfree_extent(
 	}
 	return 0;
 }
-

+ 11 - 2
fs/xfs/libxfs/xfs_sb.c

@@ -262,6 +262,12 @@ xfs_mount_validate_sb(
 		return -EFSCORRUPTED;
 	}
 
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) {
+		xfs_notice(mp, "v5 SB sanity check failed");
+		return -EFSCORRUPTED;
+	}
+
 	/*
 	 * Until this is fixed only page-sized or smaller data blocks work.
 	 */
@@ -338,13 +344,16 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
 					XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
 	sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
 
-	if (sbp->sb_qflags & XFS_PQUOTA_ACCT)  {
+	if (sbp->sb_qflags & XFS_PQUOTA_ACCT &&
+	    sbp->sb_gquotino != NULLFSINO)  {
 		/*
 		 * In older version of superblock, on-disk superblock only
 		 * has sb_gquotino, and in-core superblock has both sb_gquotino
 		 * and sb_pquotino. But, only one of them is supported at any
 		 * point of time. So, if PQUOTA is set in disk superblock,
-		 * copy over sb_gquotino to sb_pquotino.
+		 * copy over sb_gquotino to sb_pquotino.  The NULLFSINO test
+		 * above is to make sure we don't do this twice and wipe them
+		 * both out!
 		 */
 		sbp->sb_pquotino = sbp->sb_gquotino;
 		sbp->sb_gquotino = NULLFSINO;

+ 3 - 1
fs/xfs/libxfs/xfs_types.h

@@ -57,7 +57,6 @@ typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
 
 #define	NULLAGBLOCK	((xfs_agblock_t)-1)
 #define	NULLAGNUMBER	((xfs_agnumber_t)-1)
-#define	NULLEXTNUM	((xfs_extnum_t)-1)
 
 #define NULLCOMMITLSN	((xfs_lsn_t)-1)
 
@@ -75,11 +74,14 @@ typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
  * Minimum and maximum blocksize and sectorsize.
  * The blocksize upper limit is pretty much arbitrary.
  * The sectorsize upper limit is due to sizeof(sb_sectsize).
+ * CRC enabled filesystems use 512 byte inodes, meaning 512 byte block sizes
+ * cannot be used.
  */
 #define XFS_MIN_BLOCKSIZE_LOG	9	/* i.e. 512 bytes */
 #define XFS_MAX_BLOCKSIZE_LOG	16	/* i.e. 65536 bytes */
 #define XFS_MIN_BLOCKSIZE	(1 << XFS_MIN_BLOCKSIZE_LOG)
 #define XFS_MAX_BLOCKSIZE	(1 << XFS_MAX_BLOCKSIZE_LOG)
+#define XFS_MIN_CRC_BLOCKSIZE	(1 << (XFS_MIN_BLOCKSIZE_LOG + 1))
 #define XFS_MIN_SECTORSIZE_LOG	9	/* i.e. 512 bytes */
 #define XFS_MAX_SECTORSIZE_LOG	15	/* i.e. 32768 bytes */
 #define XFS_MIN_SECTORSIZE	(1 << XFS_MIN_SECTORSIZE_LOG)

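Worked out, the new limit is XFS_MIN_CRC_BLOCKSIZE = 1 << (9 + 1) = 1024 bytes, i.e. twice XFS_MIN_BLOCKSIZE, because v5 (CRC) filesystems use 512-byte inodes and so cannot support 512-byte blocks; the xfs_mount_validate_sb() hunk earlier in this diff rejects such superblocks with -EFSCORRUPTED.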
+ 13 - 273
fs/xfs/xfs_aops.c

@@ -37,11 +37,6 @@
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
-/* flags for direct write completions */
-#define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
-#define XFS_DIO_FLAG_APPEND	(1 << 1)
-#define XFS_DIO_FLAG_COW	(1 << 2)
-
 /*
  * structure owned by writepages passed to individual writepage calls
  */
@@ -776,7 +771,7 @@ xfs_map_cow(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_bmbt_irec	imap;
-	bool			is_cow = false, need_alloc = false;
+	bool			is_cow = false;
 	int			error;
 
 	/*
@@ -794,7 +789,7 @@ xfs_map_cow(
 	 * Else we need to check if there is a COW mapping at this offset.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc);
+	is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (!is_cow)
@@ -804,7 +799,7 @@ xfs_map_cow(
 	 * And if the COW mapping has a delayed extent here we need to
 	 * allocate real space for it now.
 	 */
-	if (need_alloc) {
+	if (isnullstartblock(imap.br_startblock)) {
 		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
 				&imap);
 		if (error)
@@ -1174,45 +1169,6 @@ xfs_vm_releasepage(
 	return try_to_free_buffers(page);
 }
 
-/*
- * When we map a DIO buffer, we may need to pass flags to
- * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
- *
- * Note that for DIO, an IO to the highest supported file block offset (i.e.
- * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
- * bit variable. Hence if we see this overflow, we have to assume that the IO is
- * extending the file size. We won't know for sure until IO completion is run
- * and the actual max write offset is communicated to the IO completion
- * routine.
- */
-static void
-xfs_map_direct(
-	struct inode		*inode,
-	struct buffer_head	*bh_result,
-	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset,
-	bool			is_cow)
-{
-	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;
-	xfs_off_t		size = bh_result->b_size;
-
-	trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
-		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
-		XFS_IO_OVERWRITE, imap);
-
-	if (ISUNWRITTEN(imap)) {
-		*flags |= XFS_DIO_FLAG_UNWRITTEN;
-		set_buffer_defer_completion(bh_result);
-	} else if (is_cow) {
-		*flags |= XFS_DIO_FLAG_COW;
-		set_buffer_defer_completion(bh_result);
-	}
-	if (offset + size > i_size_read(inode) || offset + size < 0) {
-		*flags |= XFS_DIO_FLAG_APPEND;
-		set_buffer_defer_completion(bh_result);
-	}
-}
-
 /*
  * If this is O_DIRECT or the mpage code calling tell them how large the mapping
  * is, so that we can avoid repeated get_blocks calls.
@@ -1253,51 +1209,12 @@ xfs_map_trim_size(
 	bh_result->b_size = mapping_size;
 }
 
-/* Bounce unaligned directio writes to the page cache. */
 static int
-xfs_bounce_unaligned_dio_write(
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		offset_fsb,
-	struct xfs_bmbt_irec	*imap)
-{
-	struct xfs_bmbt_irec	irec;
-	xfs_fileoff_t		delta;
-	bool			shared;
-	bool			x;
-	int			error;
-
-	irec = *imap;
-	if (offset_fsb > irec.br_startoff) {
-		delta = offset_fsb - irec.br_startoff;
-		irec.br_blockcount -= delta;
-		irec.br_startblock += delta;
-		irec.br_startoff = offset_fsb;
-	}
-	error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
-	if (error)
-		return error;
-
-	/*
-	 * We're here because we're trying to do a directio write to a
-	 * region that isn't aligned to a filesystem block.  If any part
-	 * of the extent is shared, fall back to buffered mode to handle
-	 * the RMW.  This is done by returning -EREMCHG ("remote addr
-	 * changed"), which is caught further up the call stack.
-	 */
-	if (shared) {
-		trace_xfs_reflink_bounce_dio_write(ip, imap);
-		return -EREMCHG;
-	}
-	return 0;
-}
-
-STATIC int
-__xfs_get_blocks(
+xfs_get_blocks(
 	struct inode		*inode,
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
-	int			create,
-	bool			direct)
+	int			create)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -1308,11 +1225,8 @@ __xfs_get_blocks(
 	int			nimaps = 1;
 	xfs_off_t		offset;
 	ssize_t			size;
-	int			new = 0;
-	bool			is_cow = false;
-	bool			need_alloc = false;
 
-	BUG_ON(create && !direct);
+	BUG_ON(create);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
@@ -1321,7 +1235,7 @@ __xfs_get_blocks(
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
 	size = bh_result->b_size;
 
-	if (!create && offset >= i_size_read(inode))
+	if (offset >= i_size_read(inode))
 		return 0;
 
 	/*
@@ -1336,52 +1250,12 @@ __xfs_get_blocks(
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-	if (create && direct && xfs_is_reflink_inode(ip))
-		is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
-					&need_alloc);
-	if (!is_cow) {
-		error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
-					&imap, &nimaps, XFS_BMAPI_ENTIRE);
-		/*
-		 * Truncate an overwrite extent if there's a pending CoW
-		 * reservation before the end of this extent.  This
-		 * forces us to come back to get_blocks to take care of
-		 * the CoW.
-		 */
-		if (create && direct && nimaps &&
-		    imap.br_startblock != HOLESTARTBLOCK &&
-		    imap.br_startblock != DELAYSTARTBLOCK &&
-		    !ISUNWRITTEN(&imap))
-			xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
-					&imap);
-	}
-	ASSERT(!need_alloc);
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				&imap, &nimaps, XFS_BMAPI_ENTIRE);
 	if (error)
 		goto out_unlock;
 
-	/* for DAX, we convert unwritten extents directly */
-	if (create &&
-	    (!nimaps ||
-	     (imap.br_startblock == HOLESTARTBLOCK ||
-	      imap.br_startblock == DELAYSTARTBLOCK) ||
-	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-		/*
-		 * xfs_iomap_write_direct() expects the shared lock. It
-		 * is unlocked on return.
-		 */
-		if (lockmode == XFS_ILOCK_EXCL)
-			xfs_ilock_demote(ip, lockmode);
-
-		error = xfs_iomap_write_direct(ip, offset, size,
-					       &imap, nimaps);
-		if (error)
-			return error;
-		new = 1;
-
-		trace_xfs_get_blocks_alloc(ip, offset, size,
-				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
-						   : XFS_IO_DELALLOC, &imap);
-	} else if (nimaps) {
+	if (nimaps) {
 		trace_xfs_get_blocks_found(ip, offset, size,
 				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
 						   : XFS_IO_OVERWRITE, &imap);
@@ -1391,12 +1265,6 @@ __xfs_get_blocks(
 		goto out_unlock;
 	}
 
-	if (IS_DAX(inode) && create) {
-		ASSERT(!ISUNWRITTEN(&imap));
-		/* zeroing is not needed at a higher layer */
-		new = 0;
-	}
-
 	/* trim mapping down to size requested */
 	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
 
@@ -1406,45 +1274,14 @@ __xfs_get_blocks(
 	 */
 	if (imap.br_startblock != HOLESTARTBLOCK &&
 	    imap.br_startblock != DELAYSTARTBLOCK &&
-	    (create || !ISUNWRITTEN(&imap))) {
-		if (create && direct && !is_cow) {
-			error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
-					&imap);
-			if (error)
-				return error;
-		}
-
+	    !ISUNWRITTEN(&imap))
 		xfs_map_buffer(inode, bh_result, &imap, offset);
-		if (ISUNWRITTEN(&imap))
-			set_buffer_unwritten(bh_result);
-		/* direct IO needs special help */
-		if (create)
-			xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
-	}
 
 	/*
 	 * If this is a realtime file, data may be on a different device.
 	 * to that pointed to from the buffer_head b_bdev currently.
 	 */
 	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
-
-	/*
-	 * If we previously allocated a block out beyond eof and we are now
-	 * coming back to use it then we will need to flag it as new even if it
-	 * has a disk address.
-	 *
-	 * With sub-block writes into unwritten extents we also need to mark
-	 * the buffer as new so that the unwritten parts of the buffer gets
-	 * correctly zeroed.
-	 */
-	if (create &&
-	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
-	     (offset >= i_size_read(inode)) ||
-	     (new || ISUNWRITTEN(&imap))))
-		set_buffer_new(bh_result);
-
-	BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
-
 	return 0;
 
 out_unlock:
@@ -1452,100 +1289,6 @@ out_unlock:
 	return error;
 }
 
-int
-xfs_get_blocks(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	int			create)
-{
-	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
-}
-
-int
-xfs_get_blocks_direct(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	int			create)
-{
-	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
-}
-
-/*
- * Complete a direct I/O write request.
- *
- * xfs_map_direct passes us some flags in the private data to tell us what to
- * do.  If no flags are set, then the write IO is an overwrite wholly within
- * the existing allocated file size and so there is nothing for us to do.
- *
- * Note that in this case the completion can be called in interrupt context,
- * whereas if we have flags set we will always be called in task context
- * (i.e. from a workqueue).
- */
-int
-xfs_end_io_direct_write(
-	struct kiocb		*iocb,
-	loff_t			offset,
-	ssize_t			size,
-	void			*private)
-{
-	struct inode		*inode = file_inode(iocb->ki_filp);
-	struct xfs_inode	*ip = XFS_I(inode);
-	uintptr_t		flags = (uintptr_t)private;
-	int			error = 0;
-
-	trace_xfs_end_io_direct_write(ip, offset, size);
-
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return -EIO;
-
-	if (size <= 0)
-		return size;
-
-	/*
-	 * The flags tell us whether we are doing unwritten extent conversions
-	 * or an append transaction that updates the on-disk file size. These
-	 * cases are the only cases where we should *potentially* be needing
-	 * to update the VFS inode size.
-	 */
-	if (flags == 0) {
-		ASSERT(offset + size <= i_size_read(inode));
-		return 0;
-	}
-
-	/*
-	 * We need to update the in-core inode size here so that we don't end up
-	 * with the on-disk inode size being outside the in-core inode size. We
-	 * have no other method of updating EOF for AIO, so always do it here
-	 * if necessary.
-	 *
-	 * We need to lock the test/set EOF update as we can be racing with
-	 * other IO completions here to update the EOF. Failing to serialise
-	 * here can result in EOF moving backwards and Bad Things Happen when
-	 * that occurs.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (offset + size > i_size_read(inode))
-		i_size_write(inode, offset + size);
-	spin_unlock(&ip->i_flags_lock);
-
-	if (flags & XFS_DIO_FLAG_COW)
-		error = xfs_reflink_end_cow(ip, offset, size);
-	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
-		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
-
-		error = xfs_iomap_write_unwritten(ip, offset, size);
-	}
-	if (flags & XFS_DIO_FLAG_APPEND) {
-		trace_xfs_end_io_direct_write_append(ip, offset, size);
-
-		error = xfs_setfilesize(ip, offset, size);
-	}
-
-	return error;
-}
-
 STATIC ssize_t
 xfs_vm_direct_IO(
 	struct kiocb		*iocb,
@@ -1566,7 +1309,6 @@ xfs_vm_bmap(
 	struct xfs_inode	*ip = XFS_I(inode);
 
 	trace_xfs_vm_bmap(XFS_I(inode));
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
 	/*
 	 * The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1574,12 +1316,10 @@ xfs_vm_bmap(
 	 * that on reflinks inodes, so we have to skip out here.  And yes,
 	 * 0 is the magic code for a bmap error..
 	 */
-	if (xfs_is_reflink_inode(ip)) {
-		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	if (xfs_is_reflink_inode(ip))
 		return 0;
 		return 0;
-	}
+
 	filemap_write_and_wait(mapping);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 	return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
 

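With xfs_end_io_direct_write() and the get_blocks variants removed, direct I/O no longer flows through ->direct_IO at all; it is issued from the read/write iterators via the new iomap infrastructure. The address-space method survives only so that O_DIRECT opens keep working, roughly this shape after the series (hedged sketch of the post-merge code):

	STATIC ssize_t
	xfs_vm_direct_IO(
		struct kiocb		*iocb,
		struct iov_iter		*iter)
	{
		/*
		 * The method only needs to exist so open/fcntl accept
		 * O_DIRECT; actual direct I/O is submitted through
		 * iomap_dio_rw() from the file operations.
		 */
		return -EINVAL;
	}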
+ 0 - 6
fs/xfs/xfs_aops.h

@@ -55,12 +55,6 @@ struct xfs_ioend {
 
 extern const struct address_space_operations xfs_address_space_operations;
 
-int	xfs_get_blocks(struct inode *inode, sector_t offset,
-		       struct buffer_head *map_bh, int create);
-int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
-			      struct buffer_head *map_bh, int create);
-int	xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
-		ssize_t size, void *private);
 int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
 extern void xfs_count_page_state(struct page *, int *, int *);

+ 2 - 2
fs/xfs/xfs_attr.h

@@ -112,8 +112,8 @@ typedef struct attrlist_cursor_kern {
  *========================================================================*/
 
 
-/* Return 0 on success, or -errno; other state communicated via *context */
-typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+/* void; state communicated via *context */
+typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
 			      unsigned char *, int, int);
 
 typedef struct xfs_attr_list_context {

+ 22 - 37
fs/xfs/xfs_attr_list.c

@@ -74,7 +74,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	xfs_attr_sf_entry_t *sfe;
 	xfs_inode_t *dp;
 	int sbsize, nsbuf, count, i;
-	int error;
 
 	ASSERT(context != NULL);
 	dp = context->dp;
@@ -102,13 +101,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	    (XFS_ISRESET_CURSOR(cursor) &&
              (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
 		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
-			error = context->put_listent(context,
-					   sfe->flags,
-					   sfe->nameval,
-					   (int)sfe->namelen,
-					   (int)sfe->valuelen);
-			if (error)
-				return error;
+			context->put_listent(context,
+					     sfe->flags,
+					     sfe->nameval,
+					     (int)sfe->namelen,
+					     (int)sfe->valuelen);
 			/*
 			 * Either search callback finished early or
 			 * didn't fit it all in the buffer after all.
@@ -193,15 +190,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 			cursor->hashval = sbp->hash;
 			cursor->offset = 0;
 		}
-		error = context->put_listent(context,
-					sbp->flags,
-					sbp->name,
-					sbp->namelen,
-					sbp->valuelen);
-		if (error) {
-			kmem_free(sbuf);
-			return error;
-		}
+		context->put_listent(context,
+				     sbp->flags,
+				     sbp->name,
+				     sbp->namelen,
+				     sbp->valuelen);
 		if (context->seen_enough)
 			break;
 		cursor->offset++;
@@ -335,11 +328,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	 */
 	for (;;) {
 		leaf = bp->b_addr;
-		error = xfs_attr3_leaf_list_int(bp, context);
-		if (error) {
-			xfs_trans_brelse(NULL, bp);
-			return error;
-		}
+		xfs_attr3_leaf_list_int(bp, context);
 		xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
 		if (context->seen_enough || leafhdr.forw == 0)
 			break;
@@ -356,7 +345,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 /*
  * Copy out attribute list entries for attr_list(), for leaf attribute lists.
  */
-int
+void
 xfs_attr3_leaf_list_int(
 	struct xfs_buf			*bp,
 	struct xfs_attr_list_context	*context)
@@ -366,7 +355,6 @@ xfs_attr3_leaf_list_int(
 	struct xfs_attr3_icleaf_hdr	ichdr;
 	struct xfs_attr_leaf_entry	*entries;
 	struct xfs_attr_leaf_entry	*entry;
-	int				retval;
 	int				i;
 	struct xfs_mount		*mp = context->dp->i_mount;
 
@@ -399,7 +387,7 @@ xfs_attr3_leaf_list_int(
 		}
 		if (i == ichdr.count) {
 			trace_xfs_attr_list_notfound(context);
-			return 0;
+			return;
 		}
 	} else {
 		entry = &entries[0];
@@ -410,7 +398,6 @@ xfs_attr3_leaf_list_int(
 	/*
 	 * We have found our place, start copying out the new attributes.
 	 */
-	retval = 0;
 	for (; i < ichdr.count; entry++, i++) {
 		char *name;
 		int namelen, valuelen;
@@ -439,16 +426,14 @@ xfs_attr3_leaf_list_int(
 			valuelen = be32_to_cpu(name_rmt->valuelen);
 		}
 
-		retval = context->put_listent(context, entry->flags,
+		context->put_listent(context, entry->flags,
 					      name, namelen, valuelen);
-		if (retval)
-			break;
 		if (context->seen_enough)
 			break;
 		cursor->offset++;
 	}
 	trace_xfs_attr_list_leaf_end(context);
-	return retval;
+	return;
 }
 
 /*
@@ -467,9 +452,9 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
 	if (error)
 		return error;
 
-	error = xfs_attr3_leaf_list_int(bp, context);
+	xfs_attr3_leaf_list_int(bp, context);
 	xfs_trans_brelse(NULL, bp);
 	xfs_trans_brelse(NULL, bp);
-	return error;
+	return 0;
 }
 
 int
@@ -513,7 +498,7 @@ xfs_attr_list_int(
  * Take care to check values and protect against them changing later,
  * we may be reading them directly out of a user buffer.
  */
-STATIC int
+STATIC void
 xfs_attr_put_listent(
 	xfs_attr_list_context_t *context,
 	int		flags,
@@ -536,10 +521,10 @@ xfs_attr_put_listent(
 	 */
 	if (((context->flags & ATTR_SECURE) == 0) !=
 	    ((flags & XFS_ATTR_SECURE) == 0))
-		return 0;
+		return;
 	if (((context->flags & ATTR_ROOT) == 0) !=
 	    ((flags & XFS_ATTR_ROOT) == 0))
-		return 0;
+		return;
 
 	arraytop = sizeof(*alist) +
 			context->count * sizeof(alist->al_offset[0]);
@@ -548,7 +533,7 @@ xfs_attr_put_listent(
 		trace_xfs_attr_list_full(context);
 		alist->al_more = 1;
 		context->seen_enough = 1;
-		return 0;
+		return;
 	}
 
 	aep = (attrlist_ent_t *)&context->alist[context->firstu];
@@ -558,7 +543,7 @@ xfs_attr_put_listent(
 	alist->al_offset[context->count++] = context->firstu;
 	alist->al_count = context->count;
 	trace_xfs_attr_list_add(context);
-	return 0;
+	return;
 }
 
 /*

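Because put_listent callbacks can no longer return an error, every implementation now signals "stop" (or an error) through the shared context, as xfs_attr_put_listent() above does with seen_enough. A hedged sketch of the resulting callback shape; the capacity test is illustrative only:

	STATIC void
	example_put_listent(
		struct xfs_attr_list_context	*context,
		int				flags,
		unsigned char			*name,
		int				namelen,
		int				valuelen)
	{
		if (context->firstu < namelen + 1) {	/* illustrative check */
			context->seen_enough = 1;	/* terminate the walk */
			return;
		}
		/* ... copy the entry out and advance context->count ... */
	}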
+ 20 - 25
fs/xfs/xfs_bmap_util.c

@@ -359,9 +359,7 @@ xfs_bmap_count_blocks(
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
-		xfs_bmap_count_leaves(ifp, 0,
-			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
-			count);
+		xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count);
 		return 0;
 	}
 
@@ -426,7 +424,7 @@ xfs_getbmapx_fix_eof_hole(
 		ifp = XFS_IFORK_PTR(ip, whichfork);
 		if (!moretocome &&
 		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
-		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+		   (lastx == xfs_iext_count(ifp) - 1))
 			out->bmv_oflags |= BMV_OF_LAST;
 	}
 
@@ -1792,6 +1790,7 @@ xfs_swap_extent_forks(
 	struct xfs_ifork	tempifp, *ifp, *tifp;
 	int			aforkblks = 0;
 	int			taforkblks = 0;
+	xfs_extnum_t		nextents;
 	__uint64_t		tmp;
 	__uint64_t		tmp;
 	int			error;
 	int			error;
 
 
@@ -1877,14 +1876,13 @@ xfs_swap_extent_forks(
 
 
 	switch (ip->i_d.di_format) {
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 	case XFS_DINODE_FMT_EXTENTS:
-		/* If the extents fit in the inode, fix the
-		 * pointer.  Otherwise it's already NULL or
-		 * pointing to the extent.
+		/*
+		 * If the extents fit in the inode, fix the pointer.  Otherwise
+		 * it's already NULL or pointing to the extent.
 		 */
 		 */
-		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
-			ifp->if_u1.if_extents =
-				ifp->if_u2.if_inline_ext;
-		}
+		nextents = xfs_iext_count(&ip->i_df);
+		if (nextents <= XFS_INLINE_EXTS)
+			ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 		(*src_log_flags) |= XFS_ILOG_DEXT;
 		(*src_log_flags) |= XFS_ILOG_DEXT;
 		break;
 		break;
 	case XFS_DINODE_FMT_BTREE:
 	case XFS_DINODE_FMT_BTREE:
@@ -1896,14 +1894,13 @@ xfs_swap_extent_forks(
 
 
 	switch (tip->i_d.di_format) {
 	switch (tip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 	case XFS_DINODE_FMT_EXTENTS:
-		/* If the extents fit in the inode, fix the
-		 * pointer.  Otherwise it's already NULL or
-		 * pointing to the extent.
+		/*
+		 * If the extents fit in the inode, fix the pointer.  Otherwise
+		 * it's already NULL or pointing to the extent.
 		 */
 		 */
-		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
-			tifp->if_u1.if_extents =
-				tifp->if_u2.if_inline_ext;
-		}
+		nextents = xfs_iext_count(&tip->i_df);
+		if (nextents <= XFS_INLINE_EXTS)
+			tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext;
 		(*target_log_flags) |= XFS_ILOG_DEXT;
 		(*target_log_flags) |= XFS_ILOG_DEXT;
 		break;
 		break;
 	case XFS_DINODE_FMT_BTREE:
 	case XFS_DINODE_FMT_BTREE:
@@ -1938,8 +1935,8 @@ xfs_swap_extents(
 	 * page cache safely. Once we have done this we can take the ilocks and
 	 * page cache safely. Once we have done this we can take the ilocks and
 	 * do the rest of the checks.
 	 * do the rest of the checks.
 	 */
 	 */
-	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
-	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
+	lock_flags = XFS_MMAPLOCK_EXCL;
 	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
 	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
 
 
 	/* Verify that both files have the same format */
 	/* Verify that both files have the same format */
@@ -2079,15 +2076,13 @@ xfs_swap_extents(
 	trace_xfs_swap_extent_after(ip, 0);
 	trace_xfs_swap_extent_after(ip, 0);
 	trace_xfs_swap_extent_after(tip, 1);
 	trace_xfs_swap_extent_after(tip, 1);
 
 
+out_unlock:
 	xfs_iunlock(ip, lock_flags);
 	xfs_iunlock(ip, lock_flags);
 	xfs_iunlock(tip, lock_flags);
 	xfs_iunlock(tip, lock_flags);
+	unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
 	return error;
 	return error;
 
 
 out_trans_cancel:
 out_trans_cancel:
 	xfs_trans_cancel(tp);
 	xfs_trans_cancel(tp);
-
-out_unlock:
-	xfs_iunlock(ip, lock_flags);
-	xfs_iunlock(tip, lock_flags);
-	return error;
+	goto out_unlock;
 }
 }

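With the XFS iolock gone, xfs_swap_extents() takes the pair of VFS i_rwsem locks through lock_two_nondirectories() (which orders the two inodes by address), and the single out_unlock label now unwinds everything in reverse. A condensed sketch of the resulting pairing, error handling elided:

	lock_two_nondirectories(VFS_I(ip), VFS_I(tip));	/* i_rwsem pair, replaces XFS_IOLOCK_EXCL */
	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);

	/* ... format checks, fork swap, transaction commit ... */

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	xfs_iunlock(tip, XFS_MMAPLOCK_EXCL);
	unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));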
+ 74 - 49
fs/xfs/xfs_buf.c

@@ -219,7 +219,6 @@ _xfs_buf_alloc(
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_list);
-	RB_CLEAR_NODE(&bp->b_rbnode);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
 	spin_lock_init(&bp->b_lock);
 	XB_SET_OWNER(bp);
@@ -473,6 +472,62 @@ _xfs_buf_map_pages(
 /*
  *	Finding and Reading Buffers
  */
+static int
+_xfs_buf_obj_cmp(
+	struct rhashtable_compare_arg	*arg,
+	const void			*obj)
+{
+	const struct xfs_buf_map	*map = arg->key;
+	const struct xfs_buf		*bp = obj;
+
+	/*
+	 * The key hashing in the lookup path depends on the key being the
+	 * first element of the compare_arg, make sure to assert this.
+	 */
+	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
+
+	if (bp->b_bn != map->bm_bn)
+		return 1;
+
+	if (unlikely(bp->b_length != map->bm_len)) {
+		/*
+		 * found a block number match. If the range doesn't
+		 * match, the only way this is allowed is if the buffer
+		 * in the cache is stale and the transaction that made
+		 * it stale has not yet committed. i.e. we are
+		 * reallocating a busy extent. Skip this buffer and
+		 * continue searching for an exact match.
+		 */
+		ASSERT(bp->b_flags & XBF_STALE);
+		return 1;
+	}
+	return 0;
+}
+
+static const struct rhashtable_params xfs_buf_hash_params = {
+	.min_size		= 32,	/* empty AGs have minimal footprint */
+	.nelem_hint		= 16,
+	.key_len		= sizeof(xfs_daddr_t),
+	.key_offset		= offsetof(struct xfs_buf, b_bn),
+	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= _xfs_buf_obj_cmp,
+};
+
+int
+xfs_buf_hash_init(
+	struct xfs_perag	*pag)
+{
+	spin_lock_init(&pag->pag_buf_lock);
+	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+}
+
+void
+xfs_buf_hash_destroy(
+	struct xfs_perag	*pag)
+{
+	rhashtable_destroy(&pag->pag_buf_hash);
+}
 
 /*
  *	Look up, and creates if absent, a lockable buffer for
@@ -488,27 +543,24 @@ _xfs_buf_find(
 	xfs_buf_t		*new_bp)
 {
 	struct xfs_perag	*pag;
-	struct rb_node		**rbp;
-	struct rb_node		*parent;
 	xfs_buf_t		*bp;
-	xfs_daddr_t		blkno = map[0].bm_bn;
+	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
 	xfs_daddr_t		eofs;
-	int			numblks = 0;
 	int			i;
 
 	for (i = 0; i < nmaps; i++)
-		numblks += map[i].bm_len;
+		cmap.bm_len += map[i].bm_len;
 
 	/* Check for IOs smaller than the sector size / not sector aligned */
-	ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
-	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
+	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
+	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
 
 	/*
 	 * Corrupted block numbers can get through to here, unfortunately, so we
 	 * have to check that the buffer falls within the filesystem bounds.
 	 */
 	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
-	if (blkno < 0 || blkno >= eofs) {
+	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
 		/*
 		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
 		 * but none of the higher level infrastructure supports
@@ -516,53 +568,29 @@ _xfs_buf_find(
 		 */
 		xfs_alert(btp->bt_mount,
 			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
-			  __func__, blkno, eofs);
+			  __func__, cmap.bm_bn, eofs);
 		WARN_ON(1);
 		return NULL;
 	}
 
-	/* get tree root */
 	pag = xfs_perag_get(btp->bt_mount,
-				xfs_daddr_to_agno(btp->bt_mount, blkno));
+			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
 
-	/* walk tree */
 	spin_lock(&pag->pag_buf_lock);
-	rbp = &pag->pag_buf_tree.rb_node;
-	parent = NULL;
-	bp = NULL;
-	while (*rbp) {
-		parent = *rbp;
-		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
-
-		if (blkno < bp->b_bn)
-			rbp = &(*rbp)->rb_left;
-		else if (blkno > bp->b_bn)
-			rbp = &(*rbp)->rb_right;
-		else {
-			/*
-			 * found a block number match. If the range doesn't
-			 * match, the only way this is allowed is if the buffer
-			 * in the cache is stale and the transaction that made
-			 * it stale has not yet committed. i.e. we are
-			 * reallocating a busy extent. Skip this buffer and
-			 * continue searching to the right for an exact match.
-			 */
-			if (bp->b_length != numblks) {
-				ASSERT(bp->b_flags & XBF_STALE);
-				rbp = &(*rbp)->rb_right;
-				continue;
-			}
-			atomic_inc(&bp->b_hold);
-			goto found;
-		}
+	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
+				    xfs_buf_hash_params);
+	if (bp) {
+		atomic_inc(&bp->b_hold);
+		goto found;
 	}
 
 	/* No match found */
 	if (new_bp) {
-		rb_link_node(&new_bp->b_rbnode, parent, rbp);
-		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
 		/* the buffer keeps the perag reference until it is freed */
 		new_bp->b_pag = pag;
+		rhashtable_insert_fast(&pag->pag_buf_hash,
+				       &new_bp->b_rhash_head,
+				       xfs_buf_hash_params);
 		spin_unlock(&pag->pag_buf_lock);
 	} else {
 		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
@@ -930,7 +958,6 @@ xfs_buf_rele(
 
 	if (!pag) {
 		ASSERT(list_empty(&bp->b_lru));
-		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold)) {
 			xfs_buf_ioacct_dec(bp);
 			xfs_buf_free(bp);
@@ -938,8 +965,6 @@ xfs_buf_rele(
 		return;
 	}
 
-	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
-
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 
 	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
@@ -983,7 +1008,8 @@ xfs_buf_rele(
 		}
 
 		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
-		rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
+				       xfs_buf_hash_params);
		spin_unlock(&pag->pag_buf_lock);
 		xfs_perag_put(pag);
 		freebuf = true;
@@ -1711,8 +1737,7 @@ xfs_free_buftarg(
 	percpu_counter_destroy(&btp->bt_io_count);
 	list_lru_destroy(&btp->bt_lru);
 
-	if (mp->m_flags & XFS_MOUNT_BARRIER)
-		xfs_blkdev_issue_flush(btp);
+	xfs_blkdev_issue_flush(btp);
 
 	kmem_free(btp);
 }

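Note the keying trick in _xfs_buf_obj_cmp() above: key_len covers only bm_bn, so hashing is on the block number alone, and the custom obj_cmpfn rejects same-block buffers whose length differs (the stale-buffer case the old rbtree walk skipped to the right for). A minimal sketch of the same rhashtable pattern outside XFS, assuming only the documented <linux/rhashtable.h> API; names are illustrative:

	#include <linux/rhashtable.h>

	struct obj {
		u64			key;	/* the hashed field */
		u32			len;	/* only checked in the compare fn */
		struct rhash_head	node;
	};

	static int obj_cmp(struct rhashtable_compare_arg *arg, const void *ptr)
	{
		const struct obj *o = ptr;
		const u64 *key = arg->key;

		/* non-zero means "not a match, keep looking", as above */
		return o->key != *key;
	}

	static const struct rhashtable_params obj_params = {
		.key_len		= sizeof(u64),
		.key_offset		= offsetof(struct obj, key),
		.head_offset		= offsetof(struct obj, node),
		.obj_cmpfn		= obj_cmp,
		.automatic_shrinking	= true,
	};

Lookups then use rhashtable_lookup_fast(&ht, &key, obj_params), and inserts rhashtable_insert_fast(), exactly as the converted _xfs_buf_find() does under pag_buf_lock.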
+ 2 - 1
fs/xfs/xfs_buf.h

@@ -71,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_READ,		"READ" }, \
 	{ XBF_WRITE,		"WRITE" }, \
 	{ XBF_READ_AHEAD,	"READ_AHEAD" }, \
+	{ XBF_NO_IOACCT,	"NO_IOACCT" }, \
 	{ XBF_ASYNC,		"ASYNC" }, \
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_STALE,		"STALE" }, \
@@ -150,7 +151,7 @@ typedef struct xfs_buf {
 	 * which is the only bit that is touched if we hit the semaphore
 	 * fast-path on locking.
 	 */
-	struct rb_node		b_rbnode;	/* rbtree node */
+	struct rhash_head	b_rhash_head;	/* pag buffer hash node */
 	xfs_daddr_t		b_bn;		/* block number of buffer */
 	int			b_length;	/* size of buffer in BBs */
 	atomic_t		b_hold;		/* reference count */

+ 0 - 2
fs/xfs/xfs_dir2_readdir.c

@@ -677,7 +677,6 @@ xfs_readdir(
 	args.dp = dp;
 	args.geo = dp->i_mount->m_dir_geo;
 
-	xfs_ilock(dp, XFS_IOLOCK_SHARED);
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_getdents(&args, ctx);
 	else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -686,7 +685,6 @@ xfs_readdir(
 		rval = xfs_dir2_block_getdents(&args, ctx);
 	else
 		rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
-	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 
 	return rval;
 }

+ 94 - 170
fs/xfs/xfs_file.c

@@ -47,40 +47,6 @@
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
-/*
- * Locking primitives for read and write IO paths to ensure we consistently use
- * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
- */
-static inline void
-xfs_rw_ilock(
-	struct xfs_inode	*ip,
-	int			type)
-{
-	if (type & XFS_IOLOCK_EXCL)
-		inode_lock(VFS_I(ip));
-	xfs_ilock(ip, type);
-}
-
-static inline void
-xfs_rw_iunlock(
-	struct xfs_inode	*ip,
-	int			type)
-{
-	xfs_iunlock(ip, type);
-	if (type & XFS_IOLOCK_EXCL)
-		inode_unlock(VFS_I(ip));
-}
-
-static inline void
-xfs_rw_ilock_demote(
-	struct xfs_inode	*ip,
-	int			type)
-{
-	xfs_ilock_demote(ip, type);
-	if (type & XFS_IOLOCK_EXCL)
-		inode_unlock(VFS_I(ip));
-}
-
 /*
  * Clear the specified ranges to zero through either the pagecache or DAX.
  * Holes and unwritten extents will be left as-is as they already are zeroed.
@@ -183,19 +149,16 @@ xfs_file_fsync(
 
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
-	if (mp->m_flags & XFS_MOUNT_BARRIER) {
-		/*
-		 * If we have an RT and/or log subvolume we need to make sure
-		 * to flush the write cache the device used for file data
-		 * first.  This is to ensure newly written file data make
-		 * it to disk before logging the new inode size in case of
-		 * an extending write.
-		 */
-		if (XFS_IS_REALTIME_INODE(ip))
-			xfs_blkdev_issue_flush(mp->m_rtdev_targp);
-		else if (mp->m_logdev_targp != mp->m_ddev_targp)
-			xfs_blkdev_issue_flush(mp->m_ddev_targp);
-	}
+	/*
+	 * If we have an RT and/or log subvolume we need to make sure to flush
+	 * the write cache the device used for file data first.  This is to
+	 * ensure newly written file data make it to disk before logging the new
+	 * inode size in case of an extending write.
+	 */
+	if (XFS_IS_REALTIME_INODE(ip))
+		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+	else if (mp->m_logdev_targp != mp->m_ddev_targp)
+		xfs_blkdev_issue_flush(mp->m_ddev_targp);
 
 	/*
 	 * All metadata updates are logged, which means that we just have to
@@ -230,10 +193,8 @@ xfs_file_fsync(
 	 * an already allocated file and thus do not have any metadata to
 	 * commit.
 	 */
-	if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
-	    mp->m_logdev_targp == mp->m_ddev_targp &&
-	    !XFS_IS_REALTIME_INODE(ip) &&
-	    !log_flushed)
+	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
+	    mp->m_logdev_targp == mp->m_ddev_targp)
 		xfs_blkdev_issue_flush(mp->m_ddev_targp);
 
 	return error;
@@ -244,62 +205,21 @@ xfs_file_dio_aio_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	loff_t			isize = i_size_read(inode);
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
 	size_t			count = iov_iter_count(to);
-	loff_t			end = iocb->ki_pos + count - 1;
-	struct iov_iter		data;
-	struct xfs_buftarg	*target;
-	ssize_t			ret = 0;
+	ssize_t			ret;
 
 	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
 
 	if (!count)
 		return 0; /* skip atime */
 
-	if (XFS_IS_REALTIME_INODE(ip))
-		target = ip->i_mount->m_rtdev_targp;
-	else
-		target = ip->i_mount->m_ddev_targp;
-
-	/* DIO must be aligned to device logical sector size */
-	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
-		if (iocb->ki_pos == isize)
-			return 0;
-		return -EINVAL;
-	}
-
 	file_accessed(iocb->ki_filp);
 
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	if (mapping->nrpages) {
-		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
-		if (ret)
-			goto out_unlock;
-
-		/*
-		 * Invalidate whole pages. This can return an error if we fail
-		 * to invalidate a page, but this should never happen on XFS.
-		 * Warn if it does fail.
-		 */
-		ret = invalidate_inode_pages2_range(mapping,
-				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-		ret = 0;
-	}
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
-	data = *to;
-	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-			xfs_get_blocks_direct, NULL, NULL, 0);
-	if (ret >= 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(to, ret);
-	}
-
-out_unlock:
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
 }
 
@@ -317,9 +237,9 @@ xfs_file_dax_read(
 	if (!count)
 		return 0; /* skip atime */
 
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	file_accessed(iocb->ki_filp);
 	return ret;
@@ -335,9 +255,9 @@ xfs_file_buffered_aio_read(
 
 	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
 
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	ret = generic_file_read_iter(iocb, to);
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	return ret;
 }
@@ -418,15 +338,18 @@ restart:
 	if (error <= 0)
 		return error;
 
-	error = xfs_break_layouts(inode, iolock, true);
+	error = xfs_break_layouts(inode, iolock);
 	if (error)
 		return error;
 
-	/* For changing security info in file_remove_privs() we need i_mutex */
+	/*
+	 * For changing security info in file_remove_privs() we need i_rwsem
+	 * exclusively.
+	 */
 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
-		xfs_rw_iunlock(ip, *iolock);
+		xfs_iunlock(ip, *iolock);
 		*iolock = XFS_IOLOCK_EXCL;
-		xfs_rw_ilock(ip, *iolock);
+		xfs_ilock(ip, *iolock);
 		goto restart;
 	}
 	/*
@@ -451,9 +374,9 @@ restart:
 		spin_unlock(&ip->i_flags_lock);
 		if (!drained_dio) {
 			if (*iolock == XFS_IOLOCK_SHARED) {
-				xfs_rw_iunlock(ip, *iolock);
+				xfs_iunlock(ip, *iolock);
 				*iolock = XFS_IOLOCK_EXCL;
-				xfs_rw_ilock(ip, *iolock);
+				xfs_ilock(ip, *iolock);
 				iov_iter_reexpand(from, count);
 			}
 			/*
@@ -496,6 +419,58 @@ restart:
 	return 0;
 }
 
+static int
+xfs_dio_write_end_io(
+	struct kiocb		*iocb,
+	ssize_t			size,
+	unsigned		flags)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	loff_t			offset = iocb->ki_pos;
+	bool			update_size = false;
+	int			error = 0;
+
+	trace_xfs_end_io_direct_write(ip, offset, size);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	if (size <= 0)
+		return size;
+
+	/*
+	 * We need to update the in-core inode size here so that we don't end up
+	 * with the on-disk inode size being outside the in-core inode size. We
+	 * have no other method of updating EOF for AIO, so always do it here
+	 * if necessary.
+	 *
+	 * We need to lock the test/set EOF update as we can be racing with
+	 * other IO completions here to update the EOF. Failing to serialise
+	 * here can result in EOF moving backwards and Bad Things Happen when
+	 * that occurs.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (offset + size > i_size_read(inode)) {
+		i_size_write(inode, offset + size);
+		update_size = true;
+	}
+	spin_unlock(&ip->i_flags_lock);
+
+	if (flags & IOMAP_DIO_COW) {
+		error = xfs_reflink_end_cow(ip, offset, size);
+		if (error)
+			return error;
+	}
+
+	if (flags & IOMAP_DIO_UNWRITTEN)
+		error = xfs_iomap_write_unwritten(ip, offset, size);
+	else if (update_size)
+		error = xfs_setfilesize(ip, offset, size);
+
+	return error;
+}
+
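The comment in xfs_dio_write_end_io() above is the heart of the new completion path: with AIO, several completions can race to update the in-core size, and EOF must never move backwards. A userspace analogue of that test-and-set-under-lock pattern (sketch only; not the kernel code):

	#include <pthread.h>

	static pthread_mutex_t size_lock = PTHREAD_MUTEX_INITIALIZER;
	static long long in_core_size;

	static void dio_complete(long long offset, long long size)
	{
		/* compare and store must be one atomic step, or a smaller
		 * racing completion could shrink the apparent EOF */
		pthread_mutex_lock(&size_lock);
		if (offset + size > in_core_size)	/* only ever extend */
			in_core_size = offset + size;
		pthread_mutex_unlock(&size_lock);
	}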
 /*
  * xfs_file_dio_aio_write - handle direct IO writes
  *
@@ -535,9 +510,7 @@ xfs_file_dio_aio_write(
 	int			unaligned_io = 0;
 	int			iolock;
 	size_t			count = iov_iter_count(from);
-	loff_t			end;
-	struct iov_iter		data;
-	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
+	struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
@@ -559,29 +532,12 @@ xfs_file_dio_aio_write(
 		iolock = XFS_IOLOCK_SHARED;
 	}
 
-	xfs_rw_ilock(ip, iolock);
+	xfs_ilock(ip, iolock);
 
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 	count = iov_iter_count(from);
-	end = iocb->ki_pos + count - 1;
-
-	if (mapping->nrpages) {
-		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
-		if (ret)
-			goto out;
-
-		/*
-		 * Invalidate whole pages. This can return an error if we fail
-		 * to invalidate a page, but this should never happen on XFS.
-		 * Warn if it does fail.
-		 */
-		ret = invalidate_inode_pages2_range(mapping,
-				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-		ret = 0;
-	}
 
 	/*
 	 * If we are doing unaligned IO, wait for all other IO to drain,
@@ -591,7 +547,7 @@ xfs_file_dio_aio_write(
 	if (unaligned_io)
 		inode_dio_wait(inode);
 	else if (iolock == XFS_IOLOCK_EXCL) {
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
 	}
 
@@ -604,24 +560,9 @@ xfs_file_dio_aio_write(
 			goto out;
 	}
 
-	data = *from;
-	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-			xfs_get_blocks_direct, xfs_end_io_direct_write,
-			NULL, DIO_ASYNC_EXTEND);
-
-	/* see generic_file_direct_write() for why this is necessary */
-	if (mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping,
-					      iocb->ki_pos >> PAGE_SHIFT,
-					      end >> PAGE_SHIFT);
-	}
-
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(from, ret);
-	}
+	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
out:
-	xfs_rw_iunlock(ip, iolock);
+	xfs_iunlock(ip, iolock);
 
 	/*
 	 * No fallback to buffered IO on errors for XFS, direct IO will either
@@ -643,7 +584,7 @@ xfs_file_dax_write(
 	size_t			count;
 	loff_t			pos;
 
-	xfs_rw_ilock(ip, iolock);
+	xfs_ilock(ip, iolock);
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
@@ -652,15 +593,13 @@ xfs_file_dax_write(
 	count = iov_iter_count(from);
 
 	trace_xfs_file_dax_write(ip, count, pos);
-
 	ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 		i_size_write(inode, iocb->ki_pos);
 		error = xfs_setfilesize(ip, pos, ret);
 	}
-
out:
-	xfs_rw_iunlock(ip, iolock);
+	xfs_iunlock(ip, iolock);
 	return error ? error : ret;
 }
 
@@ -677,7 +616,7 @@ xfs_file_buffered_aio_write(
 	int			enospc = 0;
 	int			iolock = XFS_IOLOCK_EXCL;
 
-	xfs_rw_ilock(ip, iolock);
+	xfs_ilock(ip, iolock);
 
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
@@ -721,7 +660,7 @@ write_retry:
 
 	current->backing_dev_info = NULL;
out:
-	xfs_rw_iunlock(ip, iolock);
+	xfs_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -797,7 +736,7 @@ xfs_file_fallocate(
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock, false);
+	error = xfs_break_layouts(inode, &iolock);
 	if (error)
 		goto out_unlock;
 
@@ -939,7 +878,6 @@ xfs_file_clone_range(
 				     len, false);
 }
 
-#define XFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
 STATIC ssize_t
 xfs_file_dedupe_range(
 	struct file	*src_file,
@@ -950,14 +888,6 @@ xfs_file_dedupe_range(
 {
 	int		error;
 
-	/*
-	 * Limit the total length we will dedupe for each operation.
-	 * This is intended to bound the total time spent in this
-	 * ioctl to something sane.
-	 */
-	if (len > XFS_MAX_DEDUPE_LEN)
-		len = XFS_MAX_DEDUPE_LEN;
-
 	error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
 				     len, true);
 	if (error)
@@ -1501,15 +1431,9 @@ xfs_filemap_fault(
 		return xfs_filemap_page_mkwrite(vma, vmf);
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	if (IS_DAX(inode)) {
-		/*
-		 * we do not want to trigger unwritten extent conversion on read
-		 * faults - that is unnecessary overhead and would also require
-		 * changes to xfs_get_blocks_direct() to map unwritten extent
-		 * ioend for conversion on read-only mappings.
-		 */
+	if (IS_DAX(inode))
 		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
-	} else
+	else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 

+ 22 - 18
fs/xfs/xfs_icache.c

@@ -70,8 +70,6 @@ xfs_inode_alloc(
 	ASSERT(!xfs_isiflocked(ip));
 	ASSERT(ip->i_ino == 0);
 
-	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
 	/* initialise the xfs inode */
 	ip->i_ino = ino;
 	ip->i_mount = mp;
@@ -123,7 +121,6 @@ __xfs_inode_free(
 {
 	/* asserts to verify all state is correct here */
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!xfs_isiflocked(ip));
 	XFS_STATS_DEC(ip->i_mount, vn_active);
 
 	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
@@ -133,6 +130,8 @@ void
 xfs_inode_free(
 	struct xfs_inode	*ip)
 {
+	ASSERT(!xfs_isiflocked(ip));
+
 	/*
 	 * Because we use RCU freeing we need to ensure the inode always
 	 * appears to be reclaimed with an invalid inode number when in the
@@ -393,8 +392,8 @@ xfs_iget_cache_hit(
 		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
 		inode->i_state = I_NEW;
 
-		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+		init_rwsem(&inode->i_rwsem);
 
 		spin_unlock(&ip->i_flags_lock);
 		spin_unlock(&pag->pag_ici_lock);
@@ -981,6 +980,7 @@ restart:
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		xfs_iunpin_wait(ip);
+		/* xfs_iflush_abort() drops the flush lock */
 		xfs_iflush_abort(ip, false);
 		goto reclaim;
 	}
@@ -989,10 +989,10 @@ restart:
 			goto out_ifunlock;
 		xfs_iunpin_wait(ip);
 	}
-	if (xfs_iflags_test(ip, XFS_ISTALE))
-		goto reclaim;
-	if (xfs_inode_clean(ip))
+	if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
+		xfs_ifunlock(ip);
 		goto reclaim;
+	}
 
 	/*
 	 * Never flush out dirty data during non-blocking reclaim, as it would
@@ -1030,25 +1030,24 @@ restart:
 		xfs_buf_relse(bp);
 	}
 
-	xfs_iflock(ip);
reclaim:
+	ASSERT(!xfs_isiflocked(ip));
+
 	/*
 	 * Because we use RCU freeing we need to ensure the inode always appears
 	 * to be reclaimed with an invalid inode number when in the free state.
-	 * We do this as early as possible under the ILOCK and flush lock so
-	 * that xfs_iflush_cluster() can be guaranteed to detect races with us
-	 * here. By doing this, we guarantee that once xfs_iflush_cluster has
-	 * locked both the XFS_ILOCK and the flush lock that it will see either
-	 * a valid, flushable inode that will serialise correctly against the
-	 * locks below, or it will see a clean (and invalid) inode that it can
-	 * skip.
+	 * We do this as early as possible under the ILOCK so that
+	 * xfs_iflush_cluster() can be guaranteed to detect races with us here.
+	 * By doing this, we guarantee that once xfs_iflush_cluster has locked
+	 * XFS_ILOCK that it will see either a valid, flushable inode that will
+	 * serialise correctly, or it will see a clean (and invalid) inode that
+	 * it can skip.
 	 */
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags = XFS_IRECLAIM;
 	ip->i_ino = 0;
 	spin_unlock(&ip->i_flags_lock);
 
-	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
@@ -1580,10 +1579,15 @@ xfs_inode_free_cowblocks(
 	struct xfs_eofblocks *eofb = args;
 	bool need_iolock = true;
 	int match;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 
 	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
 
-	if (!xfs_reflink_has_real_cow_blocks(ip)) {
+	/*
+	 * Just clear the tag if we have an empty cow fork or none at all. It's
+	 * possible the inode was fully unshared since it was originally tagged.
+	 */
+	if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) {
 		trace_xfs_inode_free_cowblocks_invalid(ip);
 		xfs_inode_clear_cowblocks_tag(ip);
 		return 0;

+ 1 - 1
fs/xfs/xfs_icreate_item.c

@@ -133,7 +133,7 @@ xfs_icreate_item_committing(
 /*
  * This is the ops vector shared by all buf log items.
  */
-static struct xfs_item_ops xfs_icreate_item_ops = {
+static const struct xfs_item_ops xfs_icreate_item_ops = {
 	.iop_size	= xfs_icreate_item_size,
 	.iop_format	= xfs_icreate_item_format,
 	.iop_pin	= xfs_icreate_item_pin,

+ 33 - 51
fs/xfs/xfs_inode.c

@@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared(
 }
 
 /*
- * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and
- * the i_lock.  This routine allows various combinations of the locks to be
- * obtained.
+ * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
+ * multi-reader locks: i_mmap_lock and the i_lock.  This routine allows
+ * various combinations of the locks to be obtained.
  *
  * The 3 locks should always be ordered so that the IO lock is obtained first,
  * the mmap lock second and the ilock last in order to prevent deadlock.
  *
  * Basic locking order:
  *
- * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
  *
  * mmap_sem locking order:
  *
- * i_iolock -> page lock -> mmap_sem
+ * i_rwsem -> page lock -> mmap_sem
  * mmap_sem -> i_mmap_lock -> page_lock
  *
  * The difference in mmap_sem locking order mean that we cannot hold the
  * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
  * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
  * in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
+ * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
  * page faults already hold the mmap_sem.
  *
  * Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
+ * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
  * taken in places where we need to invalidate the page cache in a race
  * free manner (e.g. truncate, hole punch and other extent manipulation
  * functions).
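The rewritten comment above fixes the hierarchy with i_rwsem at the top. As the flag handling in the hunks below suggests, xfs_ilock() accepts the three lock classes combined in one call and takes them in that order; a hedged usage sketch of a caller that must shut out both syscall and mmap IO:

	/* Illustrative only, e.g. a truncate-like path that must invalidate
	 * the page cache race-free; xfs_ilock() itself enforces the
	 * i_rwsem -> i_mmap_lock -> i_ilock order described above. */
	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL);
	/* ... invalidate page cache, manipulate extents ... */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL);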
@@ -191,10 +191,13 @@ xfs_ilock(
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		down_write_nested(&VFS_I(ip)->i_rwsem,
+				  XFS_IOLOCK_DEP(lock_flags));
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		down_read_nested(&VFS_I(ip)->i_rwsem,
+				 XFS_IOLOCK_DEP(lock_flags));
+	}
 
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
 		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
@@ -240,10 +243,10 @@ xfs_ilock_nowait(
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 
 	if (lock_flags & XFS_IOLOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_iolock))
+		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
 			goto out;
 	} else if (lock_flags & XFS_IOLOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_iolock))
+		if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
 			goto out;
 	}
 
@@ -271,9 +274,9 @@ out_undo_mmaplock:
 		mrunlock_shared(&ip->i_mmaplock);
out_undo_iolock:
 	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
+		up_write(&VFS_I(ip)->i_rwsem);
 	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
+		up_read(&VFS_I(ip)->i_rwsem);
out:
 	return 0;
 }
@@ -310,9 +313,9 @@ xfs_iunlock(
 	ASSERT(lock_flags != 0);
 
 	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
+		up_write(&VFS_I(ip)->i_rwsem);
 	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
+		up_read(&VFS_I(ip)->i_rwsem);
 
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
 		mrunlock_excl(&ip->i_mmaplock);
@@ -345,7 +348,7 @@ xfs_ilock_demote(
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
 		mrdemote(&ip->i_mmaplock);
 	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrdemote(&ip->i_iolock);
+		downgrade_write(&VFS_I(ip)->i_rwsem);
 
 	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 }
@@ -370,8 +373,9 @@ xfs_isilocked(
 
 	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
 		if (!(lock_flags & XFS_IOLOCK_SHARED))
-			return !!ip->i_iolock.mr_writer;
-		return rwsem_is_locked(&ip->i_iolock.mr_lock);
+			return !debug_locks ||
+				lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
+		return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
 	}
 
 	ASSERT(0);
@@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
 
 	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
 		ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
-		ASSERT(xfs_lockdep_subclass_ok(subclass +
-						XFS_IOLOCK_PARENT_VAL));
 		class += subclass << XFS_IOLOCK_SHIFT;
-		if (lock_mode & XFS_IOLOCK_PARENT)
-			class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
 	}
 
 	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
@@ -477,8 +477,6 @@ xfs_lock_inodes(
 			    XFS_ILOCK_EXCL));
 	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
 			      XFS_ILOCK_SHARED)));
-	ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
-		inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
 	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
 		inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
 	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
@@ -581,10 +579,8 @@ xfs_lock_two_inodes(
 	int			attempts = 0;
 	xfs_log_item_t		*lp;
 
-	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
-		ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
-		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-	} else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
+	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
+	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
 		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
 
 	ASSERT(ip0->i_ino != ip1->i_ino);
@@ -715,7 +711,6 @@ xfs_lookup(
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return -EIO;
 
-	xfs_ilock(dp, XFS_IOLOCK_SHARED);
 	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
 	if (error)
 		goto out_unlock;
@@ -724,14 +719,12 @@ xfs_lookup(
 	if (error)
 		goto out_free_name;
 
-	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 	return 0;
 
out_free_name:
 	if (ci_name)
 		kmem_free(ci_name->name);
out_unlock:
-	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 	*ipp = NULL;
 	return error;
 }
@@ -1215,8 +1208,7 @@ xfs_create(
 	if (error)
 		goto out_release_inode;
 
-	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
-		      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = true;
 
 	xfs_defer_init(&dfops, &first_block);
@@ -1252,7 +1244,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = false;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1325,7 +1317,7 @@ xfs_create(
 	xfs_qm_dqrele(pdqp);
 
 	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
 	return error;
 }
 
@@ -1466,11 +1458,10 @@ xfs_link(
 	if (error)
 		goto std_return;
 
-	xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
 	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we are using project inheritance, we only allow hard link
@@ -2041,7 +2032,6 @@ xfs_iunlink(
 	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
 	offset = offsetof(xfs_agi_t, agi_unlinked) +
 		(sizeof(xfs_agino_t) * bucket_index);
-	xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
 	xfs_trans_log_buf(tp, agibp, offset,
 			  (offset + sizeof(xfs_agino_t) - 1));
 	return 0;
@@ -2133,7 +2123,6 @@ xfs_iunlink_remove(
 		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
 		offset = offsetof(xfs_agi_t, agi_unlinked) +
 			(sizeof(xfs_agino_t) * bucket_index);
-		xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
 		xfs_trans_log_buf(tp, agibp, offset,
 				  (offset + sizeof(xfs_agino_t) - 1));
 	} else {
@@ -2579,10 +2568,9 @@ xfs_remove(
 		goto std_return;
 	}
 
-	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
@@ -2963,12 +2951,6 @@ xfs_rename(
 	 * whether the target directory is the same as the source
 	 * directory, we can lock from 2 to 4 inodes.
 	 */
-	if (!new_parent)
-		xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
-	else
-		xfs_lock_two_inodes(src_dp, target_dp,
-				    XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
-
 	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
 
 	/*
@@ -2976,9 +2958,9 @@ xfs_rename(
 	 * we can rely on either trans_commit or trans_cancel to unlock
 	 * them.
 	 */
-	xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
 	if (new_parent)
-		xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
 	if (target_ip)
 		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);

+ 8 - 10
fs/xfs/xfs_inode.h

@@ -56,7 +56,6 @@ typedef struct xfs_inode {
 	/* Transaction and locking information. */
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
 	mrlock_t		i_lock;		/* inode lock */
-	mrlock_t		i_iolock;	/* inode IO lock */
 	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
@@ -246,6 +245,11 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
  * Synchronize processes attempting to flush the in-core inode back to disk.
  */
 
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+	return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
 extern void __xfs_iflock(struct xfs_inode *ip);
 
 static inline int xfs_iflock_nowait(struct xfs_inode *ip)
@@ -261,16 +265,12 @@ static inline void xfs_iflock(struct xfs_inode *ip)
 
 static inline void xfs_ifunlock(struct xfs_inode *ip)
 {
+	ASSERT(xfs_isiflocked(ip));
 	xfs_iflags_clear(ip, XFS_IFLOCK);
 	smp_mb();
 	wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
 }
 
-static inline int xfs_isiflocked(struct xfs_inode *ip)
-{
-	return xfs_iflags_test(ip, XFS_IFLOCK);
-}
-
 /*
  * Flags for inode locking.
  * Bit ranges:	1<<1  - 1<<16-1 -- iolock/ilock modes (bitfield)
@@ -332,7 +332,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
  * IOLOCK values
  *
  * 0-3		subclass value
- * 4-7		PARENT subclass values
+ * 4-7		unused
  *
  * MMAPLOCK values
  *
@@ -347,10 +347,8 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
  * 
 */
 #define XFS_IOLOCK_SHIFT		16
-#define XFS_IOLOCK_PARENT_VAL		4
-#define XFS_IOLOCK_MAX_SUBCLASS		(XFS_IOLOCK_PARENT_VAL - 1)
+#define XFS_IOLOCK_MAX_SUBCLASS		3
 #define XFS_IOLOCK_DEP_MASK		0x000f0000
-#define	XFS_IOLOCK_PARENT		(XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT)
 
 #define XFS_MMAPLOCK_SHIFT		20
 #define XFS_MMAPLOCK_NUMORDER		0

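xfs_isiflocked() moves above xfs_ifunlock() so the unlock path can assert the flush lock is actually held before clearing it. The flush "lock" is just a flag bit plus the bit waitqueue; the generic shape of that pattern, assuming the standard kernel bitops/wait_bit API (sketch, not the XFS code):

	static inline int trylock_flag_bit(unsigned long *flags, int bit)
	{
		return !test_and_set_bit(bit, flags);	/* 0 -> 1 means we own it */
	}

	static inline void unlock_flag_bit(unsigned long *flags, int bit)
	{
		clear_bit(bit, flags);
		smp_mb__after_atomic();		/* clear visible before the wakeup check */
		wake_up_bit(flags, bit);	/* kick anyone parked in wait_on_bit() */
	}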
+ 2 - 2
fs/xfs/xfs_inode_item.c

@@ -164,7 +164,7 @@ xfs_inode_item_format_data_fork(
 			struct xfs_bmbt_rec *p;
 
 			ASSERT(ip->i_df.if_u1.if_extents != NULL);
-			ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
+			ASSERT(xfs_iext_count(&ip->i_df) > 0);
 
 			p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
 			data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
@@ -261,7 +261,7 @@ xfs_inode_item_format_attr_fork(
 		    ip->i_afp->if_bytes > 0) {
 			struct xfs_bmbt_rec *p;
 
-			ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
+			ASSERT(xfs_iext_count(ip->i_afp) ==
 				ip->i_d.di_anextents);
 			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
 

+ 3 - 5
fs/xfs/xfs_ioctl.c

@@ -639,7 +639,7 @@ xfs_ioc_space(
 		return error;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock, false);
+	error = xfs_break_layouts(inode, &iolock);
 	if (error)
 		goto out_unlock;
 
@@ -910,16 +910,14 @@ xfs_ioc_fsgetxattr(
 	if (attr) {
 		if (ip->i_afp) {
 			if (ip->i_afp->if_flags & XFS_IFEXTENTS)
-				fa.fsx_nextents = ip->i_afp->if_bytes /
-							sizeof(xfs_bmbt_rec_t);
+				fa.fsx_nextents = xfs_iext_count(ip->i_afp);
 			else
 				fa.fsx_nextents = ip->i_d.di_anextents;
 		} else
 			fa.fsx_nextents = 0;
 	} else {
 		if (ip->i_df.if_flags & XFS_IFEXTENTS)
-			fa.fsx_nextents = ip->i_df.if_bytes /
-						sizeof(xfs_bmbt_rec_t);
+			fa.fsx_nextents = xfs_iext_count(&ip->i_df);
 		else
 			fa.fsx_nextents = ip->i_d.di_nextents;
 	}

+ 64 - 40
fs/xfs/xfs_iomap.c

@@ -395,11 +395,12 @@ xfs_iomap_prealloc_size(
 	struct xfs_inode	*ip,
 	loff_t			offset,
 	loff_t			count,
-	xfs_extnum_t		idx,
-	struct xfs_bmbt_irec	*prev)
+	xfs_extnum_t		idx)
 {
 	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	struct xfs_bmbt_irec	prev;
 	int			shift = 0;
 	int64_t			freesp;
 	xfs_fsblock_t		qblocks;
@@ -419,8 +420,8 @@ xfs_iomap_prealloc_size(
 	 */
 	if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
 	    XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
-	    idx == 0 ||
-	    prev->br_startoff + prev->br_blockcount < offset_fsb)
+	    !xfs_iext_get_extent(ifp, idx - 1, &prev) ||
+	    prev.br_startoff + prev.br_blockcount < offset_fsb)
 		return mp->m_writeio_blocks;
 
 	/*
@@ -439,8 +440,8 @@ xfs_iomap_prealloc_size(
 	 * always extends to MAXEXTLEN rather than falling short due to things
 	 * like stripe unit/width alignment of real extents.
 	 */
-	if (prev->br_blockcount <= (MAXEXTLEN >> 1))
-		alloc_blocks = prev->br_blockcount << 1;
+	if (prev.br_blockcount <= (MAXEXTLEN >> 1))
+		alloc_blocks = prev.br_blockcount << 1;
 	else
 		alloc_blocks = XFS_B_TO_FSB(mp, offset);
 	if (!alloc_blocks)
@@ -535,11 +536,11 @@ xfs_file_iomap_begin_delay(
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		maxbytes_fsb =
 		XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
-	xfs_fileoff_t		end_fsb, orig_end_fsb;
+	xfs_fileoff_t		end_fsb;
 	int			error = 0, eof = 0;
 	struct xfs_bmbt_irec	got;
-	struct xfs_bmbt_irec	prev;
 	xfs_extnum_t		idx;
+	xfs_fsblock_t		prealloc_blocks = 0;
 
 	ASSERT(!XFS_IS_REALTIME_INODE(ip));
 	ASSERT(!xfs_get_extsz_hint(ip));
@@ -563,8 +564,7 @@ xfs_file_iomap_begin_delay(
 			goto out_unlock;
 	}
 
-	xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx,
-			&got, &prev);
+	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
 	if (!eof && got.br_startoff <= offset_fsb) {
 		if (xfs_is_reflink_inode(ip)) {
 			bool		shared;
@@ -595,35 +595,32 @@ xfs_file_iomap_begin_delay(
 	 * the lower level functions are updated.
 	 */
 	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
-	end_fsb = orig_end_fsb =
-		min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
 
 	if (eof) {
-		xfs_fsblock_t	prealloc_blocks;
-
-		prealloc_blocks =
-			xfs_iomap_prealloc_size(ip, offset, count, idx, &prev);
+		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx);
 		if (prealloc_blocks) {
 			xfs_extlen_t	align;
 			xfs_off_t	end_offset;
+			xfs_fileoff_t	p_end_fsb;
 
 			end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
-			end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
-				prealloc_blocks;
+			p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+					prealloc_blocks;
 
 			align = xfs_eof_alignment(ip, 0);
 			if (align)
-				end_fsb = roundup_64(end_fsb, align);
+				p_end_fsb = roundup_64(p_end_fsb, align);
 
-			end_fsb = min(end_fsb, maxbytes_fsb);
-			ASSERT(end_fsb > offset_fsb);
+			p_end_fsb = min(p_end_fsb, maxbytes_fsb);
+			ASSERT(p_end_fsb > offset_fsb);
+			prealloc_blocks = p_end_fsb - end_fsb;
 		}
 	}
 
retry:
 	error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
-			end_fsb - offset_fsb, &got,
-			&prev, &idx, eof);
+			end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof);
 	switch (error) {
 	case 0:
 		break;
@@ -631,8 +628,8 @@ retry:
 	case -EDQUOT:
 		/* retry without any preallocation */
 		trace_xfs_delalloc_enospc(ip, offset, count);
-		if (end_fsb != orig_end_fsb) {
-			end_fsb = orig_end_fsb;
+		if (prealloc_blocks) {
+			prealloc_blocks = 0;
 			goto retry;
 		}
 		/*FALLTHRU*/
@@ -640,13 +637,6 @@ retry:
 		goto out_unlock;
 	}
 
-	/*
-	 * Tag the inode as speculatively preallocated so we can reclaim this
-	 * space on demand, if necessary.
-	 */
-	if (end_fsb != orig_end_fsb)
-		xfs_inode_set_eofblocks_tag(ip);
-
 	trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
done:
 	if (isnullstartblock(got.br_startblock))
@@ -960,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode,
 		(IS_DAX(inode) && ISUNWRITTEN(imap));
 		(IS_DAX(inode) && ISUNWRITTEN(imap));
 }
 }
 
 
+static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
+{
+	/*
+	 * COW writes will allocate delalloc space, so we need to make sure
+	 * to take the lock exclusively here.
+	 */
+	if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
+		return true;
+	if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE))
+		return true;
+	return false;
+}
+
 static int
 static int
 xfs_file_iomap_begin(
 xfs_file_iomap_begin(
 	struct inode		*inode,
 	struct inode		*inode,
@@ -979,18 +982,14 @@ xfs_file_iomap_begin(
 	if (XFS_FORCED_SHUTDOWN(mp))
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 		return -EIO;
 
 
-	if ((flags & IOMAP_WRITE) && !IS_DAX(inode) &&
-		   !xfs_get_extsz_hint(ip)) {
+	if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
+			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
 		/* Reserve delalloc blocks for regular writeback. */
 		/* Reserve delalloc blocks for regular writeback. */
 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
 				iomap);
 				iomap);
 	}
 	}
 
 
-	/*
-	 * COW writes will allocate delalloc space, so we need to make sure
-	 * to take the lock exclusively here.
-	 */
-	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+	if (need_excl_ilock(ip, flags)) {
 		lockmode = XFS_ILOCK_EXCL;
 		lockmode = XFS_ILOCK_EXCL;
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 	} else {
 	} else {
@@ -1003,17 +1002,41 @@ xfs_file_iomap_begin(
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	end_fsb = XFS_B_TO_FSB(mp, offset + length);
 	end_fsb = XFS_B_TO_FSB(mp, offset + length);
 
 
+	if (xfs_is_reflink_inode(ip) &&
+	    (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) {
+		shared = xfs_reflink_find_cow_mapping(ip, offset, &imap);
+		if (shared) {
+			xfs_iunlock(ip, lockmode);
+			goto alloc_done;
+		}
+		ASSERT(!isnullstartblock(imap.br_startblock));
+	}
+
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, 0);
 			       &nimaps, 0);
 	if (error)
 	if (error)
 		goto out_unlock;
 		goto out_unlock;
 
 
-	if (flags & IOMAP_REPORT) {
+	if ((flags & IOMAP_REPORT) ||
+	    (xfs_is_reflink_inode(ip) &&
+	     (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) {
 		/* Trim the mapping to the nearest shared extent boundary. */
 		/* Trim the mapping to the nearest shared extent boundary. */
 		error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
 		error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
 				&trimmed);
 				&trimmed);
 		if (error)
 		if (error)
 			goto out_unlock;
 			goto out_unlock;
+
+		/*
+		 * We're here because we're trying to do a directio write to a
+		 * region that isn't aligned to a filesystem block.  If the
+		 * extent is shared, fall back to buffered mode to handle the
+		 * RMW.
+		 */
+		if (!(flags & IOMAP_REPORT) && shared) {
+			trace_xfs_reflink_bounce_dio_write(ip, &imap);
+			error = -EREMCHG;
+			goto out_unlock;
+		}
 	}
 	}
 
 
 	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
 	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
@@ -1048,6 +1071,7 @@ xfs_file_iomap_begin(
 		if (error)
 		if (error)
 			return error;
 			return error;
 
 
+alloc_done:
 		iomap->flags = IOMAP_F_NEW;
 		iomap->flags = IOMAP_F_NEW;
 		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
 		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
 	} else {
 	} else {

+ 6 - 8
fs/xfs/xfs_iops.c

@@ -983,15 +983,13 @@ xfs_vn_setattr(
 		struct xfs_inode	*ip = XFS_I(d_inode(dentry));
 		struct xfs_inode	*ip = XFS_I(d_inode(dentry));
 		uint			iolock = XFS_IOLOCK_EXCL;
 		uint			iolock = XFS_IOLOCK_EXCL;
 
 
-		xfs_ilock(ip, iolock);
-		error = xfs_break_layouts(d_inode(dentry), &iolock, true);
-		if (!error) {
-			xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-			iolock |= XFS_MMAPLOCK_EXCL;
+		error = xfs_break_layouts(d_inode(dentry), &iolock);
+		if (error)
+			return error;
 
 
-			error = xfs_vn_setattr_size(dentry, iattr);
-		}
-		xfs_iunlock(ip, iolock);
+		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+		error = xfs_vn_setattr_size(dentry, iattr);
+		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 	} else {
 	} else {
 		error = xfs_vn_setattr_nonsize(dentry, iattr);
 		error = xfs_vn_setattr_nonsize(dentry, iattr);
 	}
 	}

+ 1 - 0
fs/xfs/xfs_linux.h

@@ -78,6 +78,7 @@ typedef __u32			xfs_nlink_t;
 #include <linux/freezer.h>
 #include <linux/freezer.h>
 #include <linux/list_sort.h>
 #include <linux/list_sort.h>
 #include <linux/ratelimit.h>
 #include <linux/ratelimit.h>
+#include <linux/rhashtable.h>
 
 
 #include <asm/page.h>
 #include <asm/page.h>
 #include <asm/div64.h>
 #include <asm/div64.h>

+ 17 - 24
fs/xfs/xfs_log.c

@@ -1668,7 +1668,7 @@ xlog_cksum(
 	__uint32_t		crc;
 	__uint32_t		crc;
 
 
 	/* first generate the crc for the record header ... */
 	/* first generate the crc for the record header ... */
-	crc = xfs_start_cksum((char *)rhead,
+	crc = xfs_start_cksum_update((char *)rhead,
 			      sizeof(struct xlog_rec_header),
 			      sizeof(struct xlog_rec_header),
 			      offsetof(struct xlog_rec_header, h_crc));
 			      offsetof(struct xlog_rec_header, h_crc));
 
 
@@ -1862,26 +1862,21 @@ xlog_sync(
 
 
 	bp->b_io_length = BTOBB(count);
 	bp->b_io_length = BTOBB(count);
 	bp->b_fspriv = iclog;
 	bp->b_fspriv = iclog;
-	bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
-	bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
+	bp->b_flags &= ~XBF_FLUSH;
+	bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
 
 
-	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
-		bp->b_flags |= XBF_FUA;
-
-		/*
-		 * Flush the data device before flushing the log to make
-		 * sure all meta data written back from the AIL actually made
-		 * it to disk before stamping the new log tail LSN into the
-		 * log buffer.  For an external log we need to issue the
-		 * flush explicitly, and unfortunately synchronously here;
-		 * for an internal log we can simply use the block layer
-		 * state machine for preflushes.
-		 */
-		if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
-			xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
-		else
-			bp->b_flags |= XBF_FLUSH;
-	}
+	/*
+	 * Flush the data device before flushing the log to make sure all meta
+	 * data written back from the AIL actually made it to disk before
+	 * stamping the new log tail LSN into the log buffer.  For an external
+	 * log we need to issue the flush explicitly, and unfortunately
+	 * synchronously here; for an internal log we can simply use the block
+	 * layer state machine for preflushes.
+	 */
+	if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+		xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
+	else
+		bp->b_flags |= XBF_FLUSH;
 
 
 	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
 	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
 	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
 	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1906,10 +1901,8 @@ xlog_sync(
 		xfs_buf_associate_memory(bp,
 		xfs_buf_associate_memory(bp,
 				(char *)&iclog->ic_header + count, split);
 				(char *)&iclog->ic_header + count, split);
 		bp->b_fspriv = iclog;
 		bp->b_fspriv = iclog;
-		bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
-		bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
-		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
-			bp->b_flags |= XBF_FUA;
+		bp->b_flags &= ~XBF_FLUSH;
+		bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
 
 
 		ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
 		ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
 		ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
 		ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);

+ 9 - 7
fs/xfs/xfs_log_recover.c

@@ -2025,7 +2025,7 @@ xlog_peek_buffer_cancelled(
 	struct xlog		*log,
 	struct xlog		*log,
 	xfs_daddr_t		blkno,
 	xfs_daddr_t		blkno,
 	uint			len,
 	uint			len,
-	ushort			flags)
+	unsigned short			flags)
 {
 {
 	struct list_head	*bucket;
 	struct list_head	*bucket;
 	struct xfs_buf_cancel	*bcp;
 	struct xfs_buf_cancel	*bcp;
@@ -2065,7 +2065,7 @@ xlog_check_buffer_cancelled(
 	struct xlog		*log,
 	struct xlog		*log,
 	xfs_daddr_t		blkno,
 	xfs_daddr_t		blkno,
 	uint			len,
 	uint			len,
-	ushort			flags)
+	unsigned short			flags)
 {
 {
 	struct xfs_buf_cancel	*bcp;
 	struct xfs_buf_cancel	*bcp;
 
 
@@ -5113,19 +5113,21 @@ xlog_recover_process(
 	struct list_head	*buffer_list)
 	struct list_head	*buffer_list)
 {
 {
 	int			error;
 	int			error;
+	__le32			old_crc = rhead->h_crc;
 	__le32			crc;
 	__le32			crc;
 
 
+
 	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
 	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
 
 
 	/*
 	/*
 	 * Nothing else to do if this is a CRC verification pass. Just return
 	 * Nothing else to do if this is a CRC verification pass. Just return
 	 * if this a record with a non-zero crc. Unfortunately, mkfs always
 	 * if this a record with a non-zero crc. Unfortunately, mkfs always
-	 * sets h_crc to 0 so we must consider this valid even on v5 supers.
+	 * sets old_crc to 0 so we must consider this valid even on v5 supers.
 	 * Otherwise, return EFSBADCRC on failure so the callers up the stack
 	 * Otherwise, return EFSBADCRC on failure so the callers up the stack
 	 * know precisely what failed.
 	 * know precisely what failed.
 	 */
 	 */
 	if (pass == XLOG_RECOVER_CRCPASS) {
 	if (pass == XLOG_RECOVER_CRCPASS) {
-		if (rhead->h_crc && crc != rhead->h_crc)
+		if (old_crc && crc != old_crc)
 			return -EFSBADCRC;
 			return -EFSBADCRC;
 		return 0;
 		return 0;
 	}
 	}
@@ -5136,11 +5138,11 @@ xlog_recover_process(
 	 * zero CRC check prevents warnings from being emitted when upgrading
 	 * zero CRC check prevents warnings from being emitted when upgrading
 	 * the kernel from one that does not add CRCs by default.
 	 * the kernel from one that does not add CRCs by default.
 	 */
 	 */
-	if (crc != rhead->h_crc) {
-		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+	if (crc != old_crc) {
+		if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
 			xfs_alert(log->l_mp,
 			xfs_alert(log->l_mp,
 		"log record CRC mismatch: found 0x%x, expected 0x%x.",
 		"log record CRC mismatch: found 0x%x, expected 0x%x.",
-					le32_to_cpu(rhead->h_crc),
+					le32_to_cpu(old_crc),
 					le32_to_cpu(crc));
 					le32_to_cpu(crc));
 			xfs_hex_dump(dp, 32);
 			xfs_hex_dump(dp, 32);
 		}
 		}

+ 5 - 2
fs/xfs/xfs_mount.c

@@ -157,6 +157,7 @@ xfs_free_perag(
 		spin_unlock(&mp->m_perag_lock);
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		xfs_buf_hash_destroy(pag);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
 	}
 }
 }
@@ -212,8 +213,8 @@ xfs_initialize_perag(
 		spin_lock_init(&pag->pag_ici_lock);
 		spin_lock_init(&pag->pag_ici_lock);
 		mutex_init(&pag->pag_ici_reclaim_lock);
 		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
-		spin_lock_init(&pag->pag_buf_lock);
-		pag->pag_buf_tree = RB_ROOT;
+		if (xfs_buf_hash_init(pag))
+			goto out_unwind;
 
 
 		if (radix_tree_preload(GFP_NOFS))
 		if (radix_tree_preload(GFP_NOFS))
 			goto out_unwind;
 			goto out_unwind;
@@ -239,9 +240,11 @@ xfs_initialize_perag(
 	return 0;
 	return 0;
 
 
 out_unwind:
 out_unwind:
+	xfs_buf_hash_destroy(pag);
 	kmem_free(pag);
 	kmem_free(pag);
 	for (; index > first_initialised; index--) {
 	for (; index > first_initialised; index--) {
 		pag = radix_tree_delete(&mp->m_perag_tree, index);
 		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		xfs_buf_hash_destroy(pag);
 		kmem_free(pag);
 		kmem_free(pag);
 	}
 	}
 	return error;
 	return error;

+ 5 - 2
fs/xfs/xfs_mount.h

@@ -393,8 +393,8 @@ typedef struct xfs_perag {
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
 
 	/* buffer cache index */
 	/* buffer cache index */
-	spinlock_t	pag_buf_lock;	/* lock for pag_buf_tree */
-	struct rb_root	pag_buf_tree;	/* ordered tree of active buffers */
+	spinlock_t	pag_buf_lock;	/* lock for pag_buf_hash */
+	struct rhashtable pag_buf_hash;
 
 
 	/* for rcu-safe freeing */
 	/* for rcu-safe freeing */
 	struct rcu_head	rcu_head;
 	struct rcu_head	rcu_head;
@@ -424,6 +424,9 @@ xfs_perag_resv(
 	}
 	}
 }
 }
 
 
+int xfs_buf_hash_init(xfs_perag_t *pag);
+void xfs_buf_hash_destroy(xfs_perag_t *pag);
+
 extern void	xfs_uuid_table_free(void);
 extern void	xfs_uuid_table_free(void);
 extern int	xfs_log_sbcount(xfs_mount_t *);
 extern int	xfs_log_sbcount(xfs_mount_t *);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);

+ 1 - 6
fs/xfs/xfs_pnfs.c

@@ -32,8 +32,7 @@
 int
 int
 xfs_break_layouts(
 xfs_break_layouts(
 	struct inode		*inode,
 	struct inode		*inode,
-	uint			*iolock,
-	bool			with_imutex)
+	uint			*iolock)
 {
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			error;
 	int			error;
@@ -42,12 +41,8 @@ xfs_break_layouts(
 
 
 	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
 	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
 		xfs_iunlock(ip, *iolock);
 		xfs_iunlock(ip, *iolock);
-		if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
-			inode_unlock(inode);
 		error = break_layout(inode, true);
 		error = break_layout(inode, true);
 		*iolock = XFS_IOLOCK_EXCL;
 		*iolock = XFS_IOLOCK_EXCL;
-		if (with_imutex)
-			inode_lock(inode);
 		xfs_ilock(ip, *iolock);
 		xfs_ilock(ip, *iolock);
 	}
 	}
 
 

+ 2 - 2
fs/xfs/xfs_pnfs.h

@@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
 int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
 int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
 		struct iattr *iattr);
 		struct iattr *iattr);
 
 
-int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
+int xfs_break_layouts(struct inode *inode, uint *iolock);
 #else
 #else
 static inline int
 static inline int
-xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
+xfs_break_layouts(struct inode *inode, uint *iolock)
 {
 {
 	return 0;
 	return 0;
 }
 }

+ 1 - 1
fs/xfs/xfs_qm.c

@@ -1135,7 +1135,7 @@ xfs_qm_get_rtblks(
 			return error;
 			return error;
 	}
 	}
 	rtblks = 0;
 	rtblks = 0;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	nextents = xfs_iext_count(ifp);
 	for (idx = 0; idx < nextents; idx++)
 	for (idx = 0; idx < nextents; idx++)
 		rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
 		rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
 	*O_rtblks = (xfs_qcnt_t)rtblks;
 	*O_rtblks = (xfs_qcnt_t)rtblks;

+ 54 - 137
fs/xfs/xfs_reflink.c

@@ -243,12 +243,11 @@ xfs_reflink_reserve_cow(
 	struct xfs_bmbt_irec	*imap,
 	struct xfs_bmbt_irec	*imap,
 	bool			*shared)
 	bool			*shared)
 {
 {
-	struct xfs_bmbt_irec	got, prev;
-	xfs_fileoff_t		end_fsb, orig_end_fsb;
-	int			eof = 0, error = 0;
-	bool			trimmed;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	struct xfs_bmbt_irec	got;
+	int			error = 0;
+	bool			eof = false, trimmed;
 	xfs_extnum_t		idx;
 	xfs_extnum_t		idx;
-	xfs_extlen_t		align;
 
 
 	/*
 	/*
 	 * Search the COW fork extent list first.  This serves two purposes:
 	 * Search the COW fork extent list first.  This serves two purposes:
@@ -258,8 +257,9 @@ xfs_reflink_reserve_cow(
 	 * extent list is generally faster than going out to the shared extent
 	 * extent list is generally faster than going out to the shared extent
 	 * tree.
 	 * tree.
 	 */
 	 */
-	xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx,
-			&got, &prev);
+
+	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got))
+		eof = true;
 	if (!eof && got.br_startoff <= imap->br_startoff) {
 	if (!eof && got.br_startoff <= imap->br_startoff) {
 		trace_xfs_reflink_cow_found(ip, imap);
 		trace_xfs_reflink_cow_found(ip, imap);
 		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
 		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
@@ -285,33 +285,12 @@ xfs_reflink_reserve_cow(
 	if (error)
 	if (error)
 		return error;
 		return error;
 
 
-	end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount;
-
-	align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip));
-	if (align)
-		end_fsb = roundup_64(end_fsb, align);
-
-retry:
 	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
 	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
-			end_fsb - imap->br_startoff, &got, &prev, &idx, eof);
-	switch (error) {
-	case 0:
-		break;
-	case -ENOSPC:
-	case -EDQUOT:
-		/* retry without any preallocation */
+			imap->br_blockcount, 0, &got, &idx, eof);
+	if (error == -ENOSPC || error == -EDQUOT)
 		trace_xfs_reflink_cow_enospc(ip, imap);
 		trace_xfs_reflink_cow_enospc(ip, imap);
-		if (end_fsb != orig_end_fsb) {
-			end_fsb = orig_end_fsb;
-			goto retry;
-		}
-		/*FALLTHRU*/
-	default:
+	if (error)
 		return error;
 		return error;
-	}
-
-	if (end_fsb != orig_end_fsb)
-		xfs_inode_set_cowblocks_tag(ip);
 
 
 	trace_xfs_reflink_cow_alloc(ip, &got);
 	trace_xfs_reflink_cow_alloc(ip, &got);
 	return 0;
 	return 0;
@@ -418,87 +397,65 @@ xfs_reflink_allocate_cow_range(
 }
 }
 
 
 /*
 /*
- * Find the CoW reservation (and whether or not it needs block allocation)
- * for a given byte offset of a file.
+ * Find the CoW reservation for a given byte offset of a file.
  */
  */
 bool
 bool
 xfs_reflink_find_cow_mapping(
 xfs_reflink_find_cow_mapping(
 	struct xfs_inode		*ip,
 	struct xfs_inode		*ip,
 	xfs_off_t			offset,
 	xfs_off_t			offset,
-	struct xfs_bmbt_irec		*imap,
-	bool				*need_alloc)
+	struct xfs_bmbt_irec		*imap)
 {
 {
-	struct xfs_bmbt_irec		irec;
-	struct xfs_ifork		*ifp;
-	struct xfs_bmbt_rec_host	*gotp;
-	xfs_fileoff_t			bno;
+	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	xfs_fileoff_t			offset_fsb;
+	struct xfs_bmbt_irec		got;
 	xfs_extnum_t			idx;
 	xfs_extnum_t			idx;
 
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
 	ASSERT(xfs_is_reflink_inode(ip));
 	ASSERT(xfs_is_reflink_inode(ip));
 
 
-	/* Find the extent in the CoW fork. */
-	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	bno = XFS_B_TO_FSBT(ip->i_mount, offset);
-	gotp = xfs_iext_bno_to_ext(ifp, bno, &idx);
-	if (!gotp)
+	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
 		return false;
 		return false;
-
-	xfs_bmbt_get_all(gotp, &irec);
-	if (bno >= irec.br_startoff + irec.br_blockcount ||
-	    bno < irec.br_startoff)
+	if (got.br_startoff > offset_fsb)
 		return false;
 		return false;
 
 
 	trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
 	trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
-			&irec);
-
-	/* If it's still delalloc, we must allocate later. */
-	*imap = irec;
-	*need_alloc = !!(isnullstartblock(irec.br_startblock));
-
+			&got);
+	*imap = got;
 	return true;
 	return true;
 }
 }
 
 
 /*
 /*
  * Trim an extent to end at the next CoW reservation past offset_fsb.
  * Trim an extent to end at the next CoW reservation past offset_fsb.
  */
  */
-int
+void
 xfs_reflink_trim_irec_to_next_cow(
 xfs_reflink_trim_irec_to_next_cow(
 	struct xfs_inode		*ip,
 	struct xfs_inode		*ip,
 	xfs_fileoff_t			offset_fsb,
 	xfs_fileoff_t			offset_fsb,
 	struct xfs_bmbt_irec		*imap)
 	struct xfs_bmbt_irec		*imap)
 {
 {
-	struct xfs_bmbt_irec		irec;
-	struct xfs_ifork		*ifp;
-	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	struct xfs_bmbt_irec		got;
 	xfs_extnum_t			idx;
 	xfs_extnum_t			idx;
 
 
 	if (!xfs_is_reflink_inode(ip))
 	if (!xfs_is_reflink_inode(ip))
-		return 0;
+		return;
 
 
 	/* Find the extent in the CoW fork. */
 	/* Find the extent in the CoW fork. */
-	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
-	if (!gotp)
-		return 0;
-	xfs_bmbt_get_all(gotp, &irec);
+	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
+		return;
 
 
 	/* This is the extent before; try sliding up one. */
 	/* This is the extent before; try sliding up one. */
-	if (irec.br_startoff < offset_fsb) {
-		idx++;
-		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-			return 0;
-		gotp = xfs_iext_get_ext(ifp, idx);
-		xfs_bmbt_get_all(gotp, &irec);
+	if (got.br_startoff < offset_fsb) {
+		if (!xfs_iext_get_extent(ifp, idx + 1, &got))
+			return;
 	}
 	}
 
 
-	if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
-		return 0;
+	if (got.br_startoff >= imap->br_startoff + imap->br_blockcount)
+		return;
 
 
-	imap->br_blockcount = irec.br_startoff - imap->br_startoff;
+	imap->br_blockcount = got.br_startoff - imap->br_startoff;
 	trace_xfs_reflink_trim_irec(ip, imap);
 	trace_xfs_reflink_trim_irec(ip, imap);
-
-	return 0;
 }
 }
 
 
 /*
 /*
@@ -512,18 +469,15 @@ xfs_reflink_cancel_cow_blocks(
 	xfs_fileoff_t			end_fsb)
 	xfs_fileoff_t			end_fsb)
 {
 {
 	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	struct xfs_bmbt_irec		got, prev, del;
+	struct xfs_bmbt_irec		got, del;
 	xfs_extnum_t			idx;
 	xfs_extnum_t			idx;
 	xfs_fsblock_t			firstfsb;
 	xfs_fsblock_t			firstfsb;
 	struct xfs_defer_ops		dfops;
 	struct xfs_defer_ops		dfops;
-	int				error = 0, eof = 0;
+	int				error = 0;
 
 
 	if (!xfs_is_reflink_inode(ip))
 	if (!xfs_is_reflink_inode(ip))
 		return 0;
 		return 0;
-
-	xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx,
-			&got, &prev);
-	if (eof)
+	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
 		return 0;
 		return 0;
 
 
 	while (got.br_startoff < end_fsb) {
 	while (got.br_startoff < end_fsb) {
@@ -566,9 +520,8 @@ xfs_reflink_cancel_cow_blocks(
 			xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
 			xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
 		}
 		}
 
 
-		if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec))
+		if (!xfs_iext_get_extent(ifp, ++idx, &got))
 			break;
 			break;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
 	}
 	}
 
 
 	/* clear tag if cow fork is emptied */
 	/* clear tag if cow fork is emptied */
@@ -638,13 +591,13 @@ xfs_reflink_end_cow(
 	xfs_off_t			count)
 	xfs_off_t			count)
 {
 {
 	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	struct xfs_bmbt_irec		got, prev, del;
+	struct xfs_bmbt_irec		got, del;
 	struct xfs_trans		*tp;
 	struct xfs_trans		*tp;
 	xfs_fileoff_t			offset_fsb;
 	xfs_fileoff_t			offset_fsb;
 	xfs_fileoff_t			end_fsb;
 	xfs_fileoff_t			end_fsb;
 	xfs_fsblock_t			firstfsb;
 	xfs_fsblock_t			firstfsb;
 	struct xfs_defer_ops		dfops;
 	struct xfs_defer_ops		dfops;
-	int				error, eof = 0;
+	int				error;
 	unsigned int			resblks;
 	unsigned int			resblks;
 	xfs_filblks_t			rlen;
 	xfs_filblks_t			rlen;
 	xfs_extnum_t			idx;
 	xfs_extnum_t			idx;
@@ -668,13 +621,11 @@ xfs_reflink_end_cow(
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_ijoin(tp, ip, 0);
 
 
-	xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx,
-			&got, &prev);
-
 	/* If there is a hole at end_fsb - 1 go to the previous extent */
 	/* If there is a hole at end_fsb - 1 go to the previous extent */
-	if (eof || got.br_startoff > end_fsb) {
+	if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
+	    got.br_startoff > end_fsb) {
 		ASSERT(idx > 0);
 		ASSERT(idx > 0);
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got);
+		xfs_iext_get_extent(ifp, --idx, &got);
 	}
 	}
 
 
 	/* Walk backwards until we're out of the I/O range... */
 	/* Walk backwards until we're out of the I/O range... */
@@ -722,11 +673,9 @@ xfs_reflink_end_cow(
 		error = xfs_defer_finish(&tp, &dfops, ip);
 		error = xfs_defer_finish(&tp, &dfops, ip);
 		if (error)
 		if (error)
 			goto out_defer;
 			goto out_defer;
-
 next_extent:
 next_extent:
-		if (idx < 0)
+		if (!xfs_iext_get_extent(ifp, idx, &got))
 			break;
 			break;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
 	}
 	}
 
 
 	error = xfs_trans_commit(tp);
 	error = xfs_trans_commit(tp);
@@ -1302,13 +1251,11 @@ xfs_reflink_remap_range(
 		return -EIO;
 		return -EIO;
 
 
 	/* Lock both files against IO */
 	/* Lock both files against IO */
-	if (same_inode) {
-		xfs_ilock(src, XFS_IOLOCK_EXCL);
+	lock_two_nondirectories(inode_in, inode_out);
+	if (same_inode)
 		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
 		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
-	} else {
-		xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
+	else
 		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
 		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
-	}
 
 
 	/* Don't touch certain kinds of inodes */
 	/* Don't touch certain kinds of inodes */
 	ret = -EPERM;
 	ret = -EPERM;
@@ -1345,8 +1292,14 @@ xfs_reflink_remap_range(
 		goto out_unlock;
 		goto out_unlock;
 	}
 	}
 
 
-	if (len == 0)
+	/* Zero length dedupe exits immediately; reflink goes to EOF. */
+	if (len == 0) {
+		if (is_dedupe) {
+			ret = 0;
+			goto out_unlock;
+		}
 		len = isize - pos_in;
 		len = isize - pos_in;
+	}
 
 
 	/* Ensure offsets don't wrap and the input is inside i_size */
 	/* Ensure offsets don't wrap and the input is inside i_size */
 	if (pos_in + len < pos_in || pos_out + len < pos_out ||
 	if (pos_in + len < pos_in || pos_out + len < pos_out ||
@@ -1447,11 +1400,9 @@ xfs_reflink_remap_range(
 
 
 out_unlock:
 out_unlock:
 	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
 	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
-	xfs_iunlock(src, XFS_IOLOCK_EXCL);
-	if (src->i_ino != dest->i_ino) {
+	if (!same_inode)
 		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
 		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
-		xfs_iunlock(dest, XFS_IOLOCK_EXCL);
-	}
+	unlock_two_nondirectories(inode_in, inode_out);
 	if (ret)
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 	return ret;
 	return ret;
@@ -1697,37 +1648,3 @@ out:
 	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
 	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
 	return error;
 	return error;
 }
 }
-
-/*
- * Does this inode have any real CoW reservations?
- */
-bool
-xfs_reflink_has_real_cow_blocks(
-	struct xfs_inode		*ip)
-{
-	struct xfs_bmbt_irec		irec;
-	struct xfs_ifork		*ifp;
-	struct xfs_bmbt_rec_host	*gotp;
-	xfs_extnum_t			idx;
-
-	if (!xfs_is_reflink_inode(ip))
-		return false;
-
-	/* Go find the old extent in the CoW fork. */
-	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	gotp = xfs_iext_bno_to_ext(ifp, 0, &idx);
-	while (gotp) {
-		xfs_bmbt_get_all(gotp, &irec);
-
-		if (!isnullstartblock(irec.br_startblock))
-			return true;
-
-		/* Roll on... */
-		idx++;
-		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-			break;
-		gotp = xfs_iext_get_ext(ifp, idx);
-	}
-
-	return false;
-}

+ 2 - 4
fs/xfs/xfs_reflink.h

@@ -31,8 +31,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
 extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
 extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
 		xfs_off_t offset, xfs_off_t count);
 		xfs_off_t offset, xfs_off_t count);
 extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
 extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
-		struct xfs_bmbt_irec *imap, bool *need_alloc);
-extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
+		struct xfs_bmbt_irec *imap);
+extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
 		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
 		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
 
 
 extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
 extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
@@ -50,6 +50,4 @@ extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
 extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
 extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t len);
 		xfs_off_t len);
 
 
-extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip);
-
 #endif /* __XFS_REFLINK_H */
 #endif /* __XFS_REFLINK_H */

+ 5 - 5
fs/xfs/xfs_stats.c

@@ -80,9 +80,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
 	}
 	}
 	/* extra precision counters */
 	/* extra precision counters */
 	for_each_possible_cpu(i) {
 	for_each_possible_cpu(i) {
-		xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes;
-		xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes;
-		xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes;
+		xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
+		xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
+		xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
 	}
 	}
 
 
 	len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
 	len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
@@ -106,9 +106,9 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
 	for_each_possible_cpu(c) {
 	for_each_possible_cpu(c) {
 		preempt_disable();
 		preempt_disable();
 		/* save vn_active, it's a universal truth! */
 		/* save vn_active, it's a universal truth! */
-		vn_active = per_cpu_ptr(stats, c)->vn_active;
+		vn_active = per_cpu_ptr(stats, c)->s.vn_active;
 		memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
 		memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
-		per_cpu_ptr(stats, c)->vn_active = vn_active;
+		per_cpu_ptr(stats, c)->s.vn_active = vn_active;
 		preempt_enable();
 		preempt_enable();
 	}
 	}
 }
 }

+ 81 - 119
fs/xfs/xfs_stats.h

@@ -21,10 +21,38 @@
 
 
 #include <linux/percpu.h>
 #include <linux/percpu.h>
 
 
+/*
+ * The btree stats arrays have fixed offsets for the different stats. We
+ * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and
+ * that allows us to use fixed offsets into the stats array for each btree
+ * stat. These index offsets are defined in the order they will be emitted
+ * in the stats files, so it is possible to add new btree stat types by
+ * appending to the enum list below.
+ */
+enum {
+	__XBTS_lookup = 0,
+	__XBTS_compare = 1,
+	__XBTS_insrec = 2,
+	__XBTS_delrec = 3,
+	__XBTS_newroot = 4,
+	__XBTS_killroot = 5,
+	__XBTS_increment = 6,
+	__XBTS_decrement = 7,
+	__XBTS_lshift = 8,
+	__XBTS_rshift = 9,
+	__XBTS_split = 10,
+	__XBTS_join = 11,
+	__XBTS_alloc = 12,
+	__XBTS_free = 13,
+	__XBTS_moves = 14,
+
+	__XBTS_MAX = 15,
+};
+
 /*
 /*
  * XFS global statistics
  * XFS global statistics
  */
  */
-struct xfsstats {
+struct __xfsstats {
 # define XFSSTAT_END_EXTENT_ALLOC	4
 # define XFSSTAT_END_EXTENT_ALLOC	4
 	__uint32_t		xs_allocx;
 	__uint32_t		xs_allocx;
 	__uint32_t		xs_allocb;
 	__uint32_t		xs_allocb;
@@ -117,118 +145,20 @@ struct xfsstats {
 	__uint32_t		xb_page_found;
 	__uint32_t		xb_page_found;
 	__uint32_t		xb_get_read;
 	__uint32_t		xb_get_read;
 /* Version 2 btree counters */
 /* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2		(XFSSTAT_END_BUF+15)
-	__uint32_t		xs_abtb_2_lookup;
-	__uint32_t		xs_abtb_2_compare;
-	__uint32_t		xs_abtb_2_insrec;
-	__uint32_t		xs_abtb_2_delrec;
-	__uint32_t		xs_abtb_2_newroot;
-	__uint32_t		xs_abtb_2_killroot;
-	__uint32_t		xs_abtb_2_increment;
-	__uint32_t		xs_abtb_2_decrement;
-	__uint32_t		xs_abtb_2_lshift;
-	__uint32_t		xs_abtb_2_rshift;
-	__uint32_t		xs_abtb_2_split;
-	__uint32_t		xs_abtb_2_join;
-	__uint32_t		xs_abtb_2_alloc;
-	__uint32_t		xs_abtb_2_free;
-	__uint32_t		xs_abtb_2_moves;
-#define XFSSTAT_END_ABTC_V2		(XFSSTAT_END_ABTB_V2+15)
-	__uint32_t		xs_abtc_2_lookup;
-	__uint32_t		xs_abtc_2_compare;
-	__uint32_t		xs_abtc_2_insrec;
-	__uint32_t		xs_abtc_2_delrec;
-	__uint32_t		xs_abtc_2_newroot;
-	__uint32_t		xs_abtc_2_killroot;
-	__uint32_t		xs_abtc_2_increment;
-	__uint32_t		xs_abtc_2_decrement;
-	__uint32_t		xs_abtc_2_lshift;
-	__uint32_t		xs_abtc_2_rshift;
-	__uint32_t		xs_abtc_2_split;
-	__uint32_t		xs_abtc_2_join;
-	__uint32_t		xs_abtc_2_alloc;
-	__uint32_t		xs_abtc_2_free;
-	__uint32_t		xs_abtc_2_moves;
-#define XFSSTAT_END_BMBT_V2		(XFSSTAT_END_ABTC_V2+15)
-	__uint32_t		xs_bmbt_2_lookup;
-	__uint32_t		xs_bmbt_2_compare;
-	__uint32_t		xs_bmbt_2_insrec;
-	__uint32_t		xs_bmbt_2_delrec;
-	__uint32_t		xs_bmbt_2_newroot;
-	__uint32_t		xs_bmbt_2_killroot;
-	__uint32_t		xs_bmbt_2_increment;
-	__uint32_t		xs_bmbt_2_decrement;
-	__uint32_t		xs_bmbt_2_lshift;
-	__uint32_t		xs_bmbt_2_rshift;
-	__uint32_t		xs_bmbt_2_split;
-	__uint32_t		xs_bmbt_2_join;
-	__uint32_t		xs_bmbt_2_alloc;
-	__uint32_t		xs_bmbt_2_free;
-	__uint32_t		xs_bmbt_2_moves;
-#define XFSSTAT_END_IBT_V2		(XFSSTAT_END_BMBT_V2+15)
-	__uint32_t		xs_ibt_2_lookup;
-	__uint32_t		xs_ibt_2_compare;
-	__uint32_t		xs_ibt_2_insrec;
-	__uint32_t		xs_ibt_2_delrec;
-	__uint32_t		xs_ibt_2_newroot;
-	__uint32_t		xs_ibt_2_killroot;
-	__uint32_t		xs_ibt_2_increment;
-	__uint32_t		xs_ibt_2_decrement;
-	__uint32_t		xs_ibt_2_lshift;
-	__uint32_t		xs_ibt_2_rshift;
-	__uint32_t		xs_ibt_2_split;
-	__uint32_t		xs_ibt_2_join;
-	__uint32_t		xs_ibt_2_alloc;
-	__uint32_t		xs_ibt_2_free;
-	__uint32_t		xs_ibt_2_moves;
-#define XFSSTAT_END_FIBT_V2		(XFSSTAT_END_IBT_V2+15)
-	__uint32_t		xs_fibt_2_lookup;
-	__uint32_t		xs_fibt_2_compare;
-	__uint32_t		xs_fibt_2_insrec;
-	__uint32_t		xs_fibt_2_delrec;
-	__uint32_t		xs_fibt_2_newroot;
-	__uint32_t		xs_fibt_2_killroot;
-	__uint32_t		xs_fibt_2_increment;
-	__uint32_t		xs_fibt_2_decrement;
-	__uint32_t		xs_fibt_2_lshift;
-	__uint32_t		xs_fibt_2_rshift;
-	__uint32_t		xs_fibt_2_split;
-	__uint32_t		xs_fibt_2_join;
-	__uint32_t		xs_fibt_2_alloc;
-	__uint32_t		xs_fibt_2_free;
-	__uint32_t		xs_fibt_2_moves;
-#define XFSSTAT_END_RMAP_V2		(XFSSTAT_END_FIBT_V2+15)
-	__uint32_t		xs_rmap_2_lookup;
-	__uint32_t		xs_rmap_2_compare;
-	__uint32_t		xs_rmap_2_insrec;
-	__uint32_t		xs_rmap_2_delrec;
-	__uint32_t		xs_rmap_2_newroot;
-	__uint32_t		xs_rmap_2_killroot;
-	__uint32_t		xs_rmap_2_increment;
-	__uint32_t		xs_rmap_2_decrement;
-	__uint32_t		xs_rmap_2_lshift;
-	__uint32_t		xs_rmap_2_rshift;
-	__uint32_t		xs_rmap_2_split;
-	__uint32_t		xs_rmap_2_join;
-	__uint32_t		xs_rmap_2_alloc;
-	__uint32_t		xs_rmap_2_free;
-	__uint32_t		xs_rmap_2_moves;
-#define XFSSTAT_END_REFCOUNT		(XFSSTAT_END_RMAP_V2 + 15)
-	__uint32_t		xs_refcbt_2_lookup;
-	__uint32_t		xs_refcbt_2_compare;
-	__uint32_t		xs_refcbt_2_insrec;
-	__uint32_t		xs_refcbt_2_delrec;
-	__uint32_t		xs_refcbt_2_newroot;
-	__uint32_t		xs_refcbt_2_killroot;
-	__uint32_t		xs_refcbt_2_increment;
-	__uint32_t		xs_refcbt_2_decrement;
-	__uint32_t		xs_refcbt_2_lshift;
-	__uint32_t		xs_refcbt_2_rshift;
-	__uint32_t		xs_refcbt_2_split;
-	__uint32_t		xs_refcbt_2_join;
-	__uint32_t		xs_refcbt_2_alloc;
-	__uint32_t		xs_refcbt_2_free;
-	__uint32_t		xs_refcbt_2_moves;
+#define XFSSTAT_END_ABTB_V2		(XFSSTAT_END_BUF + __XBTS_MAX)
+	__uint32_t		xs_abtb_2[__XBTS_MAX];
+#define XFSSTAT_END_ABTC_V2		(XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
+	__uint32_t		xs_abtc_2[__XBTS_MAX];
+#define XFSSTAT_END_BMBT_V2		(XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
+	__uint32_t		xs_bmbt_2[__XBTS_MAX];
+#define XFSSTAT_END_IBT_V2		(XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
+	__uint32_t		xs_ibt_2[__XBTS_MAX];
+#define XFSSTAT_END_FIBT_V2		(XFSSTAT_END_IBT_V2 + __XBTS_MAX)
+	__uint32_t		xs_fibt_2[__XBTS_MAX];
+#define XFSSTAT_END_RMAP_V2		(XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
+	__uint32_t		xs_rmap_2[__XBTS_MAX];
+#define XFSSTAT_END_REFCOUNT		(XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
+	__uint32_t		xs_refcbt_2[__XBTS_MAX];
 #define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_REFCOUNT + 6)
 #define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_REFCOUNT + 6)
 	__uint32_t		xs_qm_dqreclaims;
 	__uint32_t		xs_qm_dqreclaims;
 	__uint32_t		xs_qm_dqreclaim_misses;
 	__uint32_t		xs_qm_dqreclaim_misses;
@@ -245,26 +175,58 @@ struct xfsstats {
 	__uint64_t		xs_read_bytes;
 	__uint64_t		xs_read_bytes;
 };
 };
 
 
+struct xfsstats {
+	union {
+		struct __xfsstats	s;
+		uint32_t		a[XFSSTAT_END_XQMSTAT];
+	};
+};
+
+/*
+ * simple wrapper for getting the array index of s struct member offset
+ */
+#define XFS_STATS_CALC_INDEX(member)	\
+	(offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t))
+
+
 int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
 int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
 void xfs_stats_clearall(struct xfsstats __percpu *stats);
 void xfs_stats_clearall(struct xfsstats __percpu *stats);
 extern struct xstats xfsstats;
 extern struct xstats xfsstats;
 
 
 #define XFS_STATS_INC(mp, v)					\
 #define XFS_STATS_INC(mp, v)					\
 do {								\
 do {								\
-	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++;	\
-	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++;	\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++;	\
 } while (0)
 } while (0)
 
 
 #define XFS_STATS_DEC(mp, v)					\
 #define XFS_STATS_DEC(mp, v)					\
 do {								\
 do {								\
-	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--;	\
-	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--;	\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--;	\
 } while (0)
 } while (0)
 
 
 #define XFS_STATS_ADD(mp, v, inc)					\
 #define XFS_STATS_ADD(mp, v, inc)					\
 do {									\
 do {									\
-	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc);	\
-	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc);	\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc);	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc);	\
+} while (0)
+
+#define XFS_STATS_INC_OFF(mp, off)				\
+do {								\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++;	\
+} while (0)
+
+#define XFS_STATS_DEC_OFF(mp, off)					\
+do {								\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off];	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off];	\
+} while (0)
+
+#define XFS_STATS_ADD_OFF(mp, off, inc)					\
+do {									\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc);	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc);	\
 } while (0)
 } while (0)
 
 
 #if defined(CONFIG_PROC_FS)
 #if defined(CONFIG_PROC_FS)

+ 17 - 10
fs/xfs/xfs_super.c

@@ -104,9 +104,6 @@ static const match_table_t tokens = {
 	{Opt_sysvgroups,"sysvgroups"},	/* group-ID from current process */
 	{Opt_sysvgroups,"sysvgroups"},	/* group-ID from current process */
 	{Opt_allocsize,	"allocsize=%s"},/* preferred allocation size */
 	{Opt_allocsize,	"allocsize=%s"},/* preferred allocation size */
 	{Opt_norecovery,"norecovery"},	/* don't run XFS recovery */
 	{Opt_norecovery,"norecovery"},	/* don't run XFS recovery */
-	{Opt_barrier,	"barrier"},	/* use writer barriers for log write and
-					 * unwritten extent conversion */
-	{Opt_nobarrier,	"nobarrier"},	/* .. disable */
 	{Opt_inode64,	"inode64"},	/* inodes can be allocated anywhere */
 	{Opt_inode64,	"inode64"},	/* inodes can be allocated anywhere */
 	{Opt_inode32,   "inode32"},	/* inode allocation limited to
 	{Opt_inode32,   "inode32"},	/* inode allocation limited to
 					 * XFS_MAXINUMBER_32 */
 					 * XFS_MAXINUMBER_32 */
@@ -134,6 +131,12 @@ static const match_table_t tokens = {
 	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
 	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
 
 
 	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
 	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
+
+	/* Deprecated mount options scheduled for removal */
+	{Opt_barrier,	"barrier"},	/* use writer barriers for log write and
+					 * unwritten extent conversion */
+	{Opt_nobarrier,	"nobarrier"},	/* .. disable */
+
 	{Opt_err,	NULL},
 	{Opt_err,	NULL},
 };
 };
 
 
@@ -301,12 +304,6 @@ xfs_parseargs(
 		case Opt_nouuid:
 		case Opt_nouuid:
 			mp->m_flags |= XFS_MOUNT_NOUUID;
 			mp->m_flags |= XFS_MOUNT_NOUUID;
 			break;
 			break;
-		case Opt_barrier:
-			mp->m_flags |= XFS_MOUNT_BARRIER;
-			break;
-		case Opt_nobarrier:
-			mp->m_flags &= ~XFS_MOUNT_BARRIER;
-			break;
 		case Opt_ikeep:
 		case Opt_ikeep:
 			mp->m_flags |= XFS_MOUNT_IKEEP;
 			mp->m_flags |= XFS_MOUNT_IKEEP;
 			break;
 			break;
@@ -374,6 +371,14 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DAX;
 			mp->m_flags |= XFS_MOUNT_DAX;
 			break;
 			break;
 #endif
 #endif
+		case Opt_barrier:
+			xfs_warn(mp, "%s option is deprecated, ignoring.", p);
+			mp->m_flags |= XFS_MOUNT_BARRIER;
+			break;
+		case Opt_nobarrier:
+			xfs_warn(mp, "%s option is deprecated, ignoring.", p);
+			mp->m_flags &= ~XFS_MOUNT_BARRIER;
+			break;
 		default:
 		default:
 			xfs_warn(mp, "unknown mount option [%s].", p);
 			xfs_warn(mp, "unknown mount option [%s].", p);
 			return -EINVAL;
 			return -EINVAL;
@@ -943,7 +948,7 @@ xfs_fs_destroy_inode(
 
 
 	trace_xfs_destroy_inode(ip);
 	trace_xfs_destroy_inode(ip);
 
 
-	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
 	XFS_STATS_INC(ip->i_mount, vn_rele);
 	XFS_STATS_INC(ip->i_mount, vn_rele);
 	XFS_STATS_INC(ip->i_mount, vn_remove);
 	XFS_STATS_INC(ip->i_mount, vn_remove);
 
 
@@ -1238,9 +1243,11 @@ xfs_fs_remount(
 		token = match_token(p, tokens, args);
 		token = match_token(p, tokens, args);
 		switch (token) {
 		switch (token) {
 		case Opt_barrier:
 		case Opt_barrier:
+			xfs_warn(mp, "%s option is deprecated, ignoring.", p);
 			mp->m_flags |= XFS_MOUNT_BARRIER;
 			mp->m_flags |= XFS_MOUNT_BARRIER;
 			break;
 			break;
 		case Opt_nobarrier:
 		case Opt_nobarrier:
+			xfs_warn(mp, "%s option is deprecated, ignoring.", p);
 			mp->m_flags &= ~XFS_MOUNT_BARRIER;
 			mp->m_flags &= ~XFS_MOUNT_BARRIER;
 			break;
 			break;
 		case Opt_inode64:
 		case Opt_inode64:

+ 3 - 4
fs/xfs/xfs_symlink.c

@@ -238,8 +238,7 @@ xfs_symlink(
 	if (error)
 	if (error)
 		goto out_release_inode;
 		goto out_release_inode;
 
 
-	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
-		      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = true;
 	unlock_dp_on_error = true;
 
 
 	/*
 	/*
@@ -287,7 +286,7 @@ xfs_symlink(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 * error path.
 	 */
 	 */
-	xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = false;
 	unlock_dp_on_error = false;
 
 
 	/*
 	/*
@@ -412,7 +411,7 @@ out_release_inode:
 	xfs_qm_dqrele(pdqp);
 	xfs_qm_dqrele(pdqp);
 
 
 	if (unlock_dp_on_error)
 	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
 	return error;
 	return error;
 }
 }
 
 

+ 0 - 109
fs/xfs/xfs_trace.h

@@ -355,7 +355,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
 DEFINE_BUF_EVENT(xfs_buf_submit);
 DEFINE_BUF_EVENT(xfs_buf_submit);
 DEFINE_BUF_EVENT(xfs_buf_submit_wait);
 DEFINE_BUF_EVENT(xfs_buf_submit_wait);
-DEFINE_BUF_EVENT(xfs_buf_bawrite);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
 DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
 DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
@@ -367,19 +366,15 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
 DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
 DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
 DEFINE_BUF_EVENT(xfs_buf_delwri_split);
 DEFINE_BUF_EVENT(xfs_buf_delwri_split);
 DEFINE_BUF_EVENT(xfs_buf_get_uncached);
 DEFINE_BUF_EVENT(xfs_buf_get_uncached);
-DEFINE_BUF_EVENT(xfs_bdstrat_shut);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
 DEFINE_BUF_EVENT(xfs_buf_error_relse);
 DEFINE_BUF_EVENT(xfs_buf_error_relse);
 DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
 DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
-DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
 DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
 DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
 
 
 /* not really buffer traces, but the buf provides useful information */
 /* not really buffer traces, but the buf provides useful information */
 DEFINE_BUF_EVENT(xfs_btree_corrupt);
 DEFINE_BUF_EVENT(xfs_btree_corrupt);
-DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
 DEFINE_BUF_EVENT(xfs_reset_dqcounts);
 DEFINE_BUF_EVENT(xfs_reset_dqcounts);
-DEFINE_BUF_EVENT(xfs_inode_item_push);
 
 
 /* pass flags explicitly */
 /* pass flags explicitly */
 DECLARE_EVENT_CLASS(xfs_buf_flags_class,
 DECLARE_EVENT_CLASS(xfs_buf_flags_class,
@@ -541,7 +536,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
 
 
 DECLARE_EVENT_CLASS(xfs_filestream_class,
 DECLARE_EVENT_CLASS(xfs_filestream_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
 	TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
@@ -680,7 +674,6 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
 DEFINE_INODE_EVENT(xfs_dir_fsync);
 DEFINE_INODE_EVENT(xfs_dir_fsync);
 DEFINE_INODE_EVENT(xfs_file_fsync);
 DEFINE_INODE_EVENT(xfs_file_fsync);
 DEFINE_INODE_EVENT(xfs_destroy_inode);
 DEFINE_INODE_EVENT(xfs_destroy_inode);
-DEFINE_INODE_EVENT(xfs_evict_inode);
 DEFINE_INODE_EVENT(xfs_update_time);
 DEFINE_INODE_EVENT(xfs_update_time);
 
 
 DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
 DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
@@ -798,7 +791,6 @@ TRACE_EVENT(xfs_irec_merge_post,
 DEFINE_EVENT(xfs_iref_class, name, \
 DEFINE_EVENT(xfs_iref_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
 	TP_ARGS(ip, caller_ip))
 	TP_ARGS(ip, caller_ip))
-DEFINE_IREF_EVENT(xfs_ihold);
 DEFINE_IREF_EVENT(xfs_irele);
 DEFINE_IREF_EVENT(xfs_irele);
 DEFINE_IREF_EVENT(xfs_inode_pin);
 DEFINE_IREF_EVENT(xfs_inode_pin);
 DEFINE_IREF_EVENT(xfs_inode_unpin);
 DEFINE_IREF_EVENT(xfs_inode_unpin);
@@ -939,7 +931,6 @@ DEFINE_DQUOT_EVENT(xfs_dqget_miss);
 DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
 DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
 DEFINE_DQUOT_EVENT(xfs_dqget_dup);
 DEFINE_DQUOT_EVENT(xfs_dqget_dup);
 DEFINE_DQUOT_EVENT(xfs_dqput);
 DEFINE_DQUOT_EVENT(xfs_dqput);
-DEFINE_DQUOT_EVENT(xfs_dqput_wait);
 DEFINE_DQUOT_EVENT(xfs_dqput_free);
 DEFINE_DQUOT_EVENT(xfs_dqput_free);
 DEFINE_DQUOT_EVENT(xfs_dqrele);
 DEFINE_DQUOT_EVENT(xfs_dqrele);
 DEFINE_DQUOT_EVENT(xfs_dqflush);
 DEFINE_DQUOT_EVENT(xfs_dqflush);
@@ -1815,7 +1806,6 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
 DEFINE_ATTR_EVENT(xfs_attr_sf_create);
 DEFINE_ATTR_EVENT(xfs_attr_sf_create);
 DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
 DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
 DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
 DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
-DEFINE_ATTR_EVENT(xfs_attr_sf_removename);
 DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
 DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
 
 
 DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
@@ -1844,7 +1834,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
 
 
 DEFINE_ATTR_EVENT(xfs_attr_node_addname);
 DEFINE_ATTR_EVENT(xfs_attr_node_addname);
 DEFINE_ATTR_EVENT(xfs_attr_node_get);
 DEFINE_ATTR_EVENT(xfs_attr_node_get);
-DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
 DEFINE_ATTR_EVENT(xfs_attr_node_replace);
 DEFINE_ATTR_EVENT(xfs_attr_node_replace);
 DEFINE_ATTR_EVENT(xfs_attr_node_removename);
 DEFINE_ATTR_EVENT(xfs_attr_node_removename);
 
 
@@ -2440,11 +2429,9 @@ DEFINE_DEFER_EVENT(xfs_defer_finish_done);
 
 
 DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
 DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
 DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
 DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error);
 
 
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
@@ -3092,87 +3079,6 @@ DEFINE_EVENT(xfs_double_io_class, name,	\
 		 struct xfs_inode *dest, xfs_off_t doffset), \
 		 struct xfs_inode *dest, xfs_off_t doffset), \
 	TP_ARGS(src, soffset, len, dest, doffset))
 	TP_ARGS(src, soffset, len, dest, doffset))
 
 
-/* two-file vfs io tracepoint class */
-DECLARE_EVENT_CLASS(xfs_double_vfs_io_class,
-	TP_PROTO(struct inode *src, u64 soffset, u64 len,
-		 struct inode *dest, u64 doffset),
-	TP_ARGS(src, soffset, len, dest, doffset),
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-		__field(unsigned long, src_ino)
-		__field(loff_t, src_isize)
-		__field(loff_t, src_offset)
-		__field(size_t, len)
-		__field(unsigned long, dest_ino)
-		__field(loff_t, dest_isize)
-		__field(loff_t, dest_offset)
-	),
-	TP_fast_assign(
-		__entry->dev = src->i_sb->s_dev;
-		__entry->src_ino = src->i_ino;
-		__entry->src_isize = i_size_read(src);
-		__entry->src_offset = soffset;
-		__entry->len = len;
-		__entry->dest_ino = dest->i_ino;
-		__entry->dest_isize = i_size_read(dest);
-		__entry->dest_offset = doffset;
-	),
-	TP_printk("dev %d:%d count %zd "
-		  "ino 0x%lx isize 0x%llx offset 0x%llx -> "
-		  "ino 0x%lx isize 0x%llx offset 0x%llx",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->len,
-		  __entry->src_ino,
-		  __entry->src_isize,
-		  __entry->src_offset,
-		  __entry->dest_ino,
-		  __entry->dest_isize,
-		  __entry->dest_offset)
-)
-
-#define DEFINE_DOUBLE_VFS_IO_EVENT(name)	\
-DEFINE_EVENT(xfs_double_vfs_io_class, name,	\
-	TP_PROTO(struct inode *src, u64 soffset, u64 len, \
-		 struct inode *dest, u64 doffset), \
-	TP_ARGS(src, soffset, len, dest, doffset))
-
-/* CoW write tracepoint */
-DECLARE_EVENT_CLASS(xfs_copy_on_write_class,
-	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk,
-		 xfs_extlen_t len, xfs_fsblock_t new_pblk),
-	TP_ARGS(ip, lblk, pblk, len, new_pblk),
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-		__field(xfs_ino_t, ino)
-		__field(xfs_fileoff_t, lblk)
-		__field(xfs_fsblock_t, pblk)
-		__field(xfs_extlen_t, len)
-		__field(xfs_fsblock_t, new_pblk)
-	),
-	TP_fast_assign(
-		__entry->dev = VFS_I(ip)->i_sb->s_dev;
-		__entry->ino = ip->i_ino;
-		__entry->lblk = lblk;
-		__entry->pblk = pblk;
-		__entry->len = len;
-		__entry->new_pblk = new_pblk;
-	),
-	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx "
-		  "len 0x%x new_pblk %llu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->ino,
-		  __entry->lblk,
-		  __entry->pblk,
-		  __entry->len,
-		  __entry->new_pblk)
-)
-
-#define DEFINE_COW_EVENT(name)	\
-DEFINE_EVENT(xfs_copy_on_write_class, name,	\
-	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \
-		 xfs_extlen_t len, xfs_fsblock_t new_pblk), \
-	TP_ARGS(ip, lblk, pblk, len, new_pblk))
-
 /* inode/irec events */
 /* inode/irec events */
 DECLARE_EVENT_CLASS(xfs_inode_irec_class,
 DECLARE_EVENT_CLASS(xfs_inode_irec_class,
 	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
 	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
@@ -3292,8 +3198,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
 
 
@@ -3302,9 +3206,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
 
 
 /* ioctl tracepoints */
 /* ioctl tracepoints */
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink);
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range);
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same);
 TRACE_EVENT(xfs_ioctl_clone,
 TRACE_EVENT(xfs_ioctl_clone,
 	TP_PROTO(struct inode *src, struct inode *dest),
 	TP_PROTO(struct inode *src, struct inode *dest),
 	TP_ARGS(src, dest),
 	TP_ARGS(src, dest),
@@ -3334,11 +3235,7 @@ TRACE_EVENT(xfs_ioctl_clone,
 
 
 /* unshare tracepoints */
 /* unshare tracepoints */
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
-DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block);
-DEFINE_PAGE_EVENT(xfs_reflink_unshare_page);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error);
 
 
 /* copy on write */
 /* copy on write */
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
@@ -3361,14 +3258,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
 
 
-DEFINE_COW_EVENT(xfs_reflink_fork_buf);
-DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error);
 
 
-DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error);
 
 
 /* rmap swapext tracepoints */
 /* rmap swapext tracepoints */
 DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
 DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);

+ 10 - 13
fs/xfs/xfs_xattr.c

@@ -130,7 +130,7 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
 	NULL
 };
 
-static int
+static void
 __xfs_xattr_put_listent(
 	struct xfs_attr_list_context *context,
 	char *prefix,
@@ -148,7 +148,7 @@ __xfs_xattr_put_listent(
 	if (arraytop > context->firstu) {
 		context->count = -1;	/* insufficient space */
 		context->seen_enough = 1;
-		return 0;
+		return;
 	}
 	offset = (char *)context->alist + context->count;
 	strncpy(offset, prefix, prefix_len);
@@ -159,10 +159,10 @@ __xfs_xattr_put_listent(
 
 compute_size:
 	context->count += prefix_len + namelen + 1;
-	return 0;
+	return;
 }
 
-static int
+static void
 xfs_xattr_put_listent(
 	struct xfs_attr_list_context *context,
 	int		flags,
@@ -180,23 +180,19 @@ xfs_xattr_put_listent(
 		if (namelen == SGI_ACL_FILE_SIZE &&
 		    strncmp(name, SGI_ACL_FILE,
 			    SGI_ACL_FILE_SIZE) == 0) {
-			int ret = __xfs_xattr_put_listent(
+			__xfs_xattr_put_listent(
 					context, XATTR_SYSTEM_PREFIX,
 					XATTR_SYSTEM_PREFIX_LEN,
 					XATTR_POSIX_ACL_ACCESS,
 					strlen(XATTR_POSIX_ACL_ACCESS));
-			if (ret)
-				return ret;
 		} else if (namelen == SGI_ACL_DEFAULT_SIZE &&
 			 strncmp(name, SGI_ACL_DEFAULT,
 				 SGI_ACL_DEFAULT_SIZE) == 0) {
-			int ret = __xfs_xattr_put_listent(
+			__xfs_xattr_put_listent(
 					context, XATTR_SYSTEM_PREFIX,
 					XATTR_SYSTEM_PREFIX_LEN,
 					XATTR_POSIX_ACL_DEFAULT,
 					strlen(XATTR_POSIX_ACL_DEFAULT));
-			if (ret)
-				return ret;
 		}
 #endif
 
@@ -205,7 +201,7 @@ xfs_xattr_put_listent(
 		 * see them.
 		 */
 		if (!capable(CAP_SYS_ADMIN))
-			return 0;
+			return;
 
 		prefix = XATTR_TRUSTED_PREFIX;
 		prefix_len = XATTR_TRUSTED_PREFIX_LEN;
@@ -217,8 +213,9 @@ xfs_xattr_put_listent(
 		prefix_len = XATTR_USER_PREFIX_LEN;
 	}
 
-	return __xfs_xattr_put_listent(context, prefix, prefix_len, name,
-				       namelen);
+	__xfs_xattr_put_listent(context, prefix, prefix_len, name,
+				namelen);
+	return;
 }
 
 ssize_t
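
With the int return gone, ->put_listent reports "out of buffer" through the context itself: __xfs_xattr_put_listent sets context->count to -1 and context->seen_enough to 1, and the attribute walk stops calling it. The caller therefore inspects the context after the walk instead of a return value. A minimal sketch of that pattern, assuming a context populated the way the listxattr path does (setup elided; the exact caller is not part of this hunk):

	/* Sketch: run the attr walk, then read the result out of the context. */
	error = xfs_attr_list_int(&context);
	if (error)
		return error;
	if (context.count < 0)
		return -ERANGE;		/* put_listent flagged insufficient space */
	return context.count;		/* bytes used in the list buffer */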

+ 11 - 0
include/linux/iomap.h

@@ -50,6 +50,7 @@ struct iomap {
 #define IOMAP_ZERO		(1 << 1) /* zeroing operation, may skip holes */
 #define IOMAP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
 #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
+#define IOMAP_DIRECT		(1 << 4) /* direct I/O */
 
 struct iomap_ops {
 	/*
@@ -83,4 +84,14 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		loff_t start, loff_t len, struct iomap_ops *ops);
 
+/*
+ * Flags for direct I/O ->end_io:
+ */
+#define IOMAP_DIO_UNWRITTEN	(1 << 0)	/* covers unwritten extent(s) */
+#define IOMAP_DIO_COW		(1 << 1)	/* covers COW extent(s) */
+typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret,
+		unsigned flags);
+ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops, iomap_dio_end_io_t end_io);
+
 #endif /* LINUX_IOMAP_H */
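
These declarations are the whole public contract of the new direct I/O path: the filesystem supplies an iomap_ops for mapping and an optional end_io callback, which sees IOMAP_DIO_UNWRITTEN and/or IOMAP_DIO_COW when post-I/O conversion work is required. A minimal sketch of a caller; example_iomap_ops and the locking around the call are assumptions for illustration, not taken from this patch:

	/*
	 * Sketch: a filesystem's direct read handing off to iomap_dio_rw().
	 * A NULL end_io is fine when no unwritten/COW post-processing is
	 * needed; example_iomap_ops stands in for the fs's real iomap_ops.
	 */
	static ssize_t
	example_file_dio_read(struct kiocb *iocb, struct iov_iter *to)
	{
		return iomap_dio_rw(iocb, to, &example_iomap_ops, NULL);
	}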

+ 23 - 2
include/linux/lockdep.h

@@ -338,9 +338,18 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 extern void lock_release(struct lockdep_map *lock, int nested,
 			 unsigned long ip);
 
-#define lockdep_is_held(lock)	lock_is_held(&(lock)->dep_map)
+/*
+ * Same "read" as for lock_acquire(), except -1 means any.
+ */
+extern int lock_is_held_type(struct lockdep_map *lock, int read);
+
+static inline int lock_is_held(struct lockdep_map *lock)
+{
+	return lock_is_held_type(lock, -1);
+}
 
-extern int lock_is_held(struct lockdep_map *lock);
+#define lockdep_is_held(lock)		lock_is_held(&(lock)->dep_map)
+#define lockdep_is_held_type(lock, r)	lock_is_held_type(&(lock)->dep_map, (r))
 
 extern void lock_set_class(struct lockdep_map *lock, const char *name,
 			   struct lock_class_key *key, unsigned int subclass,
@@ -372,6 +381,14 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 		WARN_ON(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
 
+#define lockdep_assert_held_exclusive(l)	do {			\
+		WARN_ON(debug_locks && !lockdep_is_held_type(l, 0));	\
+	} while (0)
+
+#define lockdep_assert_held_read(l)	do {				\
+		WARN_ON(debug_locks && !lockdep_is_held_type(l, 1));	\
+	} while (0)
+
 #define lockdep_assert_held_once(l)	do {				\
 		WARN_ON_ONCE(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
@@ -428,7 +445,11 @@ struct lock_class_key { };
 
 #define lockdep_depth(tsk)	(0)
 
+#define lockdep_is_held_type(l, r)		(1)
+
 #define lockdep_assert_held(l)			do { (void)(l); } while (0)
+#define lockdep_assert_held_exclusive(l)	do { (void)(l); } while (0)
+#define lockdep_assert_held_read(l)		do { (void)(l); } while (0)
 #define lockdep_assert_held_once(l)		do { (void)(l); } while (0)
 
 #define lockdep_recursing(tsk)			(0)
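
The new assertions state how a lock must be held, not just that it is held: read == 0 demands exclusive (write) ownership and read == 1 demands a reader, matching the read argument of lock_acquire(). This is what lets XFS assert the state of the VFS i_rwsem after retiring its own iolock. A hypothetical use on an inode's rwsem (the functions themselves are illustrative):

	/* Sketch: callers document their locking expectations precisely. */
	static void example_update(struct inode *inode)
	{
		/* writer path: i_rwsem must be held exclusively */
		lockdep_assert_held_exclusive(&inode->i_rwsem);
	}

	static void example_query(struct inode *inode)
	{
		/* reader path: holding i_rwsem shared is sufficient */
		lockdep_assert_held_read(&inode->i_rwsem);
	}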

+ 12 - 8
kernel/locking/lockdep.c

@@ -3191,7 +3191,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
 	return 0;
 }
 
-static int __lock_is_held(struct lockdep_map *lock);
+static int __lock_is_held(struct lockdep_map *lock, int read);
 
 /*
  * This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3332,7 +3332,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	}
 	chain_key = iterate_chain_key(chain_key, class_idx);
 
-	if (nest_lock && !__lock_is_held(nest_lock))
+	if (nest_lock && !__lock_is_held(nest_lock, -1))
 		return print_lock_nested_lock_not_held(curr, hlock, ip);
 
 	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
@@ -3579,7 +3579,7 @@ found_it:
 	return 1;
 }
 
-static int __lock_is_held(struct lockdep_map *lock)
+static int __lock_is_held(struct lockdep_map *lock, int read)
 {
 	struct task_struct *curr = current;
 	int i;
@@ -3587,8 +3587,12 @@ static int __lock_is_held(struct lockdep_map *lock)
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock))
-			return 1;
+		if (match_held_lock(hlock, lock)) {
+			if (read == -1 || hlock->read == read)
+				return 1;
+
+			return 0;
+		}
 	}
 
 	return 0;
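
In the updated walker, read == -1 preserves the old semantics (any held state matches), while 0 and 1 additionally compare against hlock->read, so a lock held only for read no longer satisfies an exclusive query. For illustration, with a hypothetical rwsem held via down_read() by the current task:

	/* Sketch of the resulting behaviour (sem is illustrative): */
	lock_is_held_type(&sem.dep_map, -1);	/* 1: held in some mode  */
	lock_is_held_type(&sem.dep_map, 1);	/* 1: held for read      */
	lock_is_held_type(&sem.dep_map, 0);	/* 0: not held for write */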
@@ -3772,7 +3776,7 @@ void lock_release(struct lockdep_map *lock, int nested,
 }
 EXPORT_SYMBOL_GPL(lock_release);
 
-int lock_is_held(struct lockdep_map *lock)
+int lock_is_held_type(struct lockdep_map *lock, int read)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -3784,13 +3788,13 @@ int lock_is_held(struct lockdep_map *lock)
 	check_flags(flags);
 
 	current->lockdep_recursion = 1;
-	ret = __lock_is_held(lock);
+	ret = __lock_is_held(lock, read);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(lock_is_held);
+EXPORT_SYMBOL_GPL(lock_is_held_type);
 
 struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
 {