@@ -37,6 +37,7 @@
 #include "xfs_log.h"
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
 }
 
 /*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is as they already are zeroed.
 */
 int
-xfs_iozero(
- struct xfs_inode *ip, /* inode */
- loff_t pos, /* offset in file */
- size_t count) /* size of data to zero */
+xfs_zero_range(
+ struct xfs_inode *ip,
+ xfs_off_t pos,
+ xfs_off_t count,
+ bool *did_zero)
 {
- struct page *page;
- struct address_space *mapping;
- int status = 0;
-
-
- mapping = VFS_I(ip)->i_mapping;
- do {
- unsigned offset, bytes;
- void *fsdata;
-
- offset = (pos & (PAGE_SIZE -1)); /* Within page */
- bytes = PAGE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- if (IS_DAX(VFS_I(ip))) {
- status = dax_zero_page_range(VFS_I(ip), pos, bytes,
- xfs_get_blocks_direct);
- if (status)
- break;
- } else {
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
-
- zero_user(page, offset, bytes);
-
- status = pagecache_write_end(NULL, mapping, pos, bytes,
- bytes, page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
- status = 0;
- }
- pos += bytes;
- count -= bytes;
- } while (count);
-
- return status;
+ return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
 }
 
 int
@@ -282,48 +239,35 @@ xfs_file_fsync(
 }
 
 STATIC ssize_t
-xfs_file_read_iter(
+xfs_file_dio_aio_read(
 struct kiocb *iocb,
 struct iov_iter *to)
 {
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
 struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- size_t size = iov_iter_count(to);
+ loff_t isize = i_size_read(inode);
+ size_t count = iov_iter_count(to);
+ struct iov_iter data;
+ struct xfs_buftarg *target;
 ssize_t ret = 0;
- int ioflags = 0;
- xfs_fsize_t n;
- loff_t pos = iocb->ki_pos;
 
- XFS_STATS_INC(mp, xs_read_calls);
-
- if (unlikely(iocb->ki_flags & IOCB_DIRECT))
- ioflags |= XFS_IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
- if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
- /* DIO must be aligned to device logical sector size */
- if ((pos | size) & target->bt_logical_sectormask) {
- if (pos == i_size_read(inode))
- return 0;
- return -EINVAL;
- }
- }
+ trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
 
- n = mp->m_super->s_maxbytes - pos;
- if (n <= 0 || size == 0)
- return 0;
+ if (!count)
+ return 0; /* skip atime */
 
- if (n < size)
- size = n;
+ if (XFS_IS_REALTIME_INODE(ip))
+ target = ip->i_mount->m_rtdev_targp;
+ else
+ target = ip->i_mount->m_ddev_targp;
 
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
+ /* DIO must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+ if (iocb->ki_pos == isize)
+ return 0;
+ return -EINVAL;
+ }
 
 /*
 * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -336,7 +280,7 @@ xfs_file_read_iter(
 * serialisation.
 */
 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+ if (mapping->nrpages) {
 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -351,8 +295,8 @@ xfs_file_read_iter(
 * flush and reduce the chances of repeated iolock cycles going
 * forward.
 */
- if (inode->i_mapping->nrpages) {
- ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (mapping->nrpages) {
+ ret = filemap_write_and_wait(mapping);
 if (ret) {
 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 return ret;
@@ -363,20 +307,95 @@
 * we fail to invalidate a page, but this should never
 * happen on XFS. Warn if it does fail.
 */
- ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+ ret = invalidate_inode_pages2(mapping);
 WARN_ON_ONCE(ret);
 ret = 0;
 }
 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 }
 
- trace_xfs_file_read(ip, size, pos, ioflags);
+ data = *to;
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, NULL, NULL, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(to, ret);
+ }
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct iov_iter data = *to;
+ size_t count = iov_iter_count(to);
+ ssize_t ret = 0;
+
+ trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+ if (!count)
+ return 0; /* skip atime */
+
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(to, ret);
+ }
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 ret = generic_file_read_iter(iocb, to);
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
+ ssize_t ret = 0;
+
+ XFS_STATS_INC(mp, xs_read_calls);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_read(iocb, to);
+ else if (iocb->ki_flags & IOCB_DIRECT)
+ ret = xfs_file_dio_aio_read(iocb, to);
+ else
+ ret = xfs_file_buffered_aio_read(iocb, to);
+
 if (ret > 0)
 XFS_STATS_ADD(mp, xs_read_bytes, ret);
-
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 return ret;
 }
 
@@ -389,18 +408,14 @@ xfs_file_splice_read(
 unsigned int flags)
 {
 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
- int ioflags = 0;
 ssize_t ret;
 
 XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
- if (infilp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 return -EIO;
 
- trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+ trace_xfs_file_splice_read(ip, count, *ppos);
 
 /*
 * DAX inodes cannot ues the page cache for splice, so we have to push
@@ -423,49 +438,6 @@ out:
 return ret;
 }
 
-/*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF. We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
- */
-STATIC int /* error (positive) */
-xfs_zero_last_block(
- struct xfs_inode *ip,
- xfs_fsize_t offset,
- xfs_fsize_t isize,
- bool *did_zeroing)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
- int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- int zero_len;
- int nimaps = 1;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- ASSERT(nimaps > 0);
-
- /*
- * If the block underlying isize is just a hole, then there
- * is nothing to zero.
- */
- if (imap.br_startblock == HOLESTARTBLOCK)
- return 0;
-
- zero_len = mp->m_sb.sb_blocksize - zero_offset;
- if (isize + zero_len > offset)
- zero_len = offset - isize;
- *did_zeroing = true;
- return xfs_iozero(ip, isize, zero_len);
-}
-
 /*
 * Zero any on disk space between the current EOF and the new, larger EOF.
 *
@@ -484,94 +456,11 @@ xfs_zero_eof(
 xfs_fsize_t isize, /* current inode size */
 bool *did_zeroing)
 {
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fileoff_t zero_off;
- xfs_fsize_t zero_len;
- int nimaps;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 ASSERT(offset > isize);
 
 trace_xfs_zero_eof(ip, isize, offset - isize);
-
- /*
- * First handle zeroing the block on which isize resides.
- *
- * We only zero a part of that block so it is handled specially.
- */
- if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
- error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
- if (error)
- return error;
- }
-
- /*
- * Calculate the range between the new size and the old where blocks
- * needing to be zeroed may exist.
- *
- * To get the block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back to a block
- * boundary. We subtract 1 in case the size is exactly on a block
- * boundary.
- */
- last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
- start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
- ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
- if (last_fsb == end_zero_fsb) {
- /*
- * The size was only incremented on its last block.
- * We took care of that above, so just return.
- */
- return 0;
- }
-
- ASSERT(start_zero_fsb <= end_zero_fsb);
- while (start_zero_fsb <= end_zero_fsb) {
- nimaps = 1;
- zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
- &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- ASSERT(nimaps > 0);
-
- if (imap.br_state == XFS_EXT_UNWRITTEN ||
- imap.br_startblock == HOLESTARTBLOCK) {
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- continue;
- }
-
- /*
- * There are blocks we need to zero.
- */
- zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
- zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
- if ((zero_off + zero_len) > offset)
- zero_len = offset - zero_off;
-
- error = xfs_iozero(ip, zero_off, zero_len);
- if (error)
- return error;
-
- *did_zeroing = true;
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- }
-
- return 0;
+ return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
 }
 
 /*
@@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
 mp->m_rtdev_targp : mp->m_ddev_targp;
 
 /* DIO must be aligned to device logical sector size */
- if (!IS_DAX(inode) &&
- ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 return -EINVAL;
 
 /* "unaligned" here means not aligned to a filesystem block */
@@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
 end = iocb->ki_pos + count - 1;
 
 /*
- * See xfs_file_read_iter() for why we do a full-file flush here.
+ * See xfs_file_dio_aio_read() for why we do a full-file flush here.
 */
 if (mapping->nrpages) {
 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
 iolock = XFS_IOLOCK_SHARED;
 }
 
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
 data = *from;
- ret = mapping->a_ops->direct_IO(iocb, &data);
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, xfs_end_io_direct_write,
+ NULL, DIO_ASYNC_EXTEND);
 
 /* see generic_file_direct_write() for why this is necessary */
 if (mapping->nrpages) {
@@ -809,10 +699,70 @@ out:
 xfs_rw_iunlock(ip, iolock);
 
 /*
- * No fallback to buffered IO on errors for XFS. DAX can result in
- * partial writes, but direct IO will either complete fully or fail.
+ * No fallback to buffered IO on errors for XFS, direct IO will either
+ * complete fully or fail.
 */
- ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+ ASSERT(ret < 0 || ret == count);
+ return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
+ int unaligned_io = 0;
+ int iolock;
+ struct iov_iter data;
+
+ /* "unaligned" here means not aligned to a filesystem block */
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+ unaligned_io = 1;
+ iolock = XFS_IOLOCK_EXCL;
+ } else if (mapping->nrpages) {
+ iolock = XFS_IOLOCK_EXCL;
+ } else {
+ iolock = XFS_IOLOCK_SHARED;
+ }
+ xfs_rw_ilock(ip, iolock);
+
+ ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out;
+
+ /*
+ * Yes, even DAX files can have page cache attached to them: A zeroed
+ * page is inserted into the pagecache when we have to serve a write
+ * fault on a hole. It should never be dirtied and can simply be
+ * dropped from the pagecache once we get real data for the page.
+ */
+ if (mapping->nrpages) {
+ ret = invalidate_inode_pages2(mapping);
+ WARN_ON_ONCE(ret);
+ }
+
+ if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+ }
+
+ trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+ data = *from;
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+ xfs_end_io_direct_write, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(from, ret);
+ }
+out:
+ xfs_rw_iunlock(ip, iolock);
 return ret;
 }
 
@@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
 current->backing_dev_info = inode_to_bdi(inode);
 
 write_retry:
- trace_xfs_file_buffered_write(ip, iov_iter_count(from),
- iocb->ki_pos, 0);
- ret = generic_perform_write(file, from, iocb->ki_pos);
+ trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
 if (likely(ret >= 0))
 iocb->ki_pos += ret;
 
@@ -895,7 +844,9 @@ xfs_file_write_iter(
 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 return -EIO;
 
- if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_write(iocb, from);
+ else if (iocb->ki_flags & IOCB_DIRECT)
 ret = xfs_file_dio_aio_write(iocb, from);
 else
 ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite(
 if (IS_DAX(inode)) {
 ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
 } else {
- ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 ret = block_page_mkwrite_return(ret);
 }
 