9 years ago · 9b7fad2076
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -4,6 +4,7 @@ config XFS_FS
 
				 	depends on (64BIT || LBDAF)
			
 
				 	select EXPORTFS
			
 
				 	select LIBCRC32C
			
 
				+	select FS_IOMAP
			
 
				 	help
			
 
				 	  XFS is a high performance journaling filesystem which originated
			
 
				 	  on the SGI IRIX platform.  It is completely multi-threaded, can
			
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1143,6 +1143,8 @@ __xfs_get_blocks(
 
				 	ssize_t			size;
			
 
				 	int			new = 0;
			
 
				 
			
 
				+	BUG_ON(create && !direct);
			
 
				+
			
 
				 	if (XFS_FORCED_SHUTDOWN(mp))
			
 
				 		return -EIO;
			
 
				 
			
@@ -1150,22 +1152,14 @@ __xfs_get_blocks(
 
				 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
			
 
				 	size = bh_result->b_size;
			
 
				 
			
 
				-	if (!create && direct && offset >= i_size_read(inode))
			
 
				+	if (!create && offset >= i_size_read(inode))
			
 
				 		return 0;
			
 
				 
			
 
				 	/*
			
 
				 	 * Direct I/O is usually done on preallocated files, so try getting
			
 
				-	 * a block mapping without an exclusive lock first.  For buffered
			
 
				-	 * writes we already have the exclusive iolock anyway, so avoiding
			
 
				-	 * a lock roundtrip here by taking the ilock exclusive from the
			
 
				-	 * beginning is a useful micro optimization.
			
 
				+	 * a block mapping without an exclusive lock first.
			
 
				 	 */
			
 
				-	if (create && !direct) {
			
 
				-		lockmode = XFS_ILOCK_EXCL;
			
 
				-		xfs_ilock(ip, lockmode);
			
 
				-	} else {
			
 
				-		lockmode = xfs_ilock_data_map_shared(ip);
			
 
				-	}
			
 
				+	lockmode = xfs_ilock_data_map_shared(ip);
			
 
				 
			
 
				 	ASSERT(offset <= mp->m_super->s_maxbytes);
			
 
				 	if (offset + size > mp->m_super->s_maxbytes)
			
@@ -1184,37 +1178,19 @@ __xfs_get_blocks(
 
				 	     (imap.br_startblock == HOLESTARTBLOCK ||
			
 
				 	      imap.br_startblock == DELAYSTARTBLOCK) ||
			
 
				 	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
			
 
				-		if (direct || xfs_get_extsz_hint(ip)) {
			
 
				-			/*
			
 
				-			 * xfs_iomap_write_direct() expects the shared lock. It
			
 
				-			 * is unlocked on return.
			
 
				-			 */
			
 
				-			if (lockmode == XFS_ILOCK_EXCL)
			
 
				-				xfs_ilock_demote(ip, lockmode);
			
 
				-
			
 
				-			error = xfs_iomap_write_direct(ip, offset, size,
			
 
				-						       &imap, nimaps);
			
 
				-			if (error)
			
 
				-				return error;
			
 
				-			new = 1;
			
 
				+		/*
			
 
				+		 * xfs_iomap_write_direct() expects the shared lock. It
			
 
				+		 * is unlocked on return.
			
 
				+		 */
			
 
				+		if (lockmode == XFS_ILOCK_EXCL)
			
 
				+			xfs_ilock_demote(ip, lockmode);
			
 
				 
			
 
				-		} else {
			
 
				-			/*
			
 
				-			 * Delalloc reservations do not require a transaction,
			
 
				-			 * we can go on without dropping the lock here. If we
			
 
				-			 * are allocating a new delalloc block, make sure that
			
 
				-			 * we set the new flag so that we mark the buffer new so
			
 
				-			 * that we know that it is newly allocated if the write
			
 
				-			 * fails.
			
 
				-			 */
			
 
				-			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
			
 
				-				new = 1;
			
 
				-			error = xfs_iomap_write_delay(ip, offset, size, &imap);
			
 
				-			if (error)
			
 
				-				goto out_unlock;
			
 
				+		error = xfs_iomap_write_direct(ip, offset, size,
			
 
				+					       &imap, nimaps);
			
 
				+		if (error)
			
 
				+			return error;
			
 
				+		new = 1;
			
 
				 
			
 
				-			xfs_iunlock(ip, lockmode);
			
 
				-		}
			
 
				 		trace_xfs_get_blocks_alloc(ip, offset, size,
			
 
				 				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
			
 
				 						   : XFS_IO_DELALLOC, &imap);
			
@@ -1235,9 +1211,7 @@ __xfs_get_blocks(
 
				 	}
			
 
				 
			
 
				 	/* trim mapping down to size requested */
			
 
				-	if (direct || size > (1 << inode->i_blkbits))
			
 
				-		xfs_map_trim_size(inode, iblock, bh_result,
			
 
				-				  &imap, offset, size);
			
 
				+	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
			
 
				 
			
 
				 	/*
			
 
				 	 * For unwritten extents do not report a disk address in the buffered
			
@@ -1250,7 +1224,7 @@ __xfs_get_blocks(
 
				 		if (ISUNWRITTEN(&imap))
			
 
				 			set_buffer_unwritten(bh_result);
			
 
				 		/* direct IO needs special help */
			
 
				-		if (create && direct) {
			
 
				+		if (create) {
			
 
				 			if (dax_fault)
			
 
				 				ASSERT(!ISUNWRITTEN(&imap));
			
 
				 			else
			
@@ -1279,14 +1253,7 @@ __xfs_get_blocks(
 
				 	     (new || ISUNWRITTEN(&imap))))
			
 
				 		set_buffer_new(bh_result);
			
 
				 
			
 
				-	if (imap.br_startblock == DELAYSTARTBLOCK) {
			
 
				-		BUG_ON(direct);
			
 
				-		if (create) {
			
 
				-			set_buffer_uptodate(bh_result);
			
 
				-			set_buffer_mapped(bh_result);
			
 
				-			set_buffer_delay(bh_result);
			
 
				-		}
			
 
				-	}
			
 
				+	BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
			
 
				 
			
 
				 	return 0;
			
 
				 
			
@@ -1427,216 +1394,6 @@ xfs_vm_direct_IO(
 
				 			xfs_get_blocks_direct, endio, NULL, flags);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Punch out the delalloc blocks we have already allocated.
			
 
				- *
			
 
				- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
			
 
				- * as the page is still locked at this point.
			
 
				- */
			
 
				-STATIC void
			
 
				-xfs_vm_kill_delalloc_range(
			
 
				-	struct inode		*inode,
			
 
				-	loff_t			start,
			
 
				-	loff_t			end)
			
 
				-{
			
 
				-	struct xfs_inode	*ip = XFS_I(inode);
			
 
				-	xfs_fileoff_t		start_fsb;
			
 
				-	xfs_fileoff_t		end_fsb;
			
 
				-	int			error;
			
 
				-
			
 
				-	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
			
 
				-	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
			
 
				-	if (end_fsb <= start_fsb)
			
 
				-		return;
			
 
				-
			
 
				-	xfs_ilock(ip, XFS_ILOCK_EXCL);
			
 
				-	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
			
 
				-						end_fsb - start_fsb);
			
 
				-	if (error) {
			
 
				-		/* something screwed, just bail */
			
 
				-		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
			
 
				-			xfs_alert(ip->i_mount,
			
 
				-		"xfs_vm_write_failed: unable to clean up ino %lld",
			
 
				-					ip->i_ino);
			
 
				-		}
			
 
				-	}
			
 
				-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
			
 
				-}
			
 
				-
			
 
				-STATIC void
			
 
				-xfs_vm_write_failed(
			
 
				-	struct inode		*inode,
			
 
				-	struct page		*page,
			
 
				-	loff_t			pos,
			
 
				-	unsigned		len)
			
 
				-{
			
 
				-	loff_t			block_offset;
			
 
				-	loff_t			block_start;
			
 
				-	loff_t			block_end;
			
 
				-	loff_t			from = pos & (PAGE_SIZE - 1);
			
 
				-	loff_t			to = from + len;
			
 
				-	struct buffer_head	*bh, *head;
			
 
				-	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
			
 
				-
			
 
				-	/*
			
 
				-	 * The request pos offset might be 32 or 64 bit, this is all fine
			
 
				-	 * on 64-bit platform.  However, for 64-bit pos request on 32-bit
			
 
				-	 * platform, the high 32-bit will be masked off if we evaluate the
			
 
				-	 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
			
 
				-	 * 0xfffff000 as an unsigned long, hence the result is incorrect
			
 
				-	 * which could cause the following ASSERT failed in most cases.
			
 
				-	 * In order to avoid this, we can evaluate the block_offset of the
			
 
				-	 * start of the page by using shifts rather than masks the mismatch
			
 
				-	 * problem.
			
 
				-	 */
			
 
				-	block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
			
 
				-
			
 
				-	ASSERT(block_offset + from == pos);
			
 
				-
			
 
				-	head = page_buffers(page);
			
 
				-	block_start = 0;
			
 
				-	for (bh = head; bh != head || !block_start;
			
 
				-	     bh = bh->b_this_page, block_start = block_end,
			
 
				-				   block_offset += bh->b_size) {
			
 
				-		block_end = block_start + bh->b_size;
			
 
				-
			
 
				-		/* skip buffers before the write */
			
 
				-		if (block_end <= from)
			
 
				-			continue;
			
 
				-
			
 
				-		/* if the buffer is after the write, we're done */
			
 
				-		if (block_start >= to)
			
 
				-			break;
			
 
				-
			
 
				-		/*
			
 
				-		 * Process delalloc and unwritten buffers beyond EOF. We can
			
 
				-		 * encounter unwritten buffers in the event that a file has
			
 
				-		 * post-EOF unwritten extents and an extending write happens to
			
 
				-		 * fail (e.g., an unaligned write that also involves a delalloc
			
 
				-		 * to the same page).
			
 
				-		 */
			
 
				-		if (!buffer_delay(bh) && !buffer_unwritten(bh))
			
 
				-			continue;
			
 
				-
			
 
				-		if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
			
 
				-		    block_offset < i_size_read(inode))
			
 
				-			continue;
			
 
				-
			
 
				-		if (buffer_delay(bh))
			
 
				-			xfs_vm_kill_delalloc_range(inode, block_offset,
			
 
				-						   block_offset + bh->b_size);
			
 
				-
			
 
				-		/*
			
 
				-		 * This buffer does not contain data anymore. make sure anyone
			
 
				-		 * who finds it knows that for certain.
			
 
				-		 */
			
 
				-		clear_buffer_delay(bh);
			
 
				-		clear_buffer_uptodate(bh);
			
 
				-		clear_buffer_mapped(bh);
			
 
				-		clear_buffer_new(bh);
			
 
				-		clear_buffer_dirty(bh);
			
 
				-		clear_buffer_unwritten(bh);
			
 
				-	}
			
 
				-
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * This used to call block_write_begin(), but it unlocks and releases the page
			
 
				- * on error, and we need that page to be able to punch stale delalloc blocks out
			
 
				- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
			
 
				- * the appropriate point.
			
 
				- */
			
 
				-STATIC int
			
 
				-xfs_vm_write_begin(
			
 
				-	struct file		*file,
			
 
				-	struct address_space	*mapping,
			
 
				-	loff_t			pos,
			
 
				-	unsigned		len,
			
 
				-	unsigned		flags,
			
 
				-	struct page		**pagep,
			
 
				-	void			**fsdata)
			
 
				-{
			
 
				-	pgoff_t			index = pos >> PAGE_SHIFT;
			
 
				-	struct page		*page;
			
 
				-	int			status;
			
 
				-	struct xfs_mount	*mp = XFS_I(mapping->host)->i_mount;
			
 
				-
			
 
				-	ASSERT(len <= PAGE_SIZE);
			
 
				-
			
 
				-	page = grab_cache_page_write_begin(mapping, index, flags);
			
 
				-	if (!page)
			
 
				-		return -ENOMEM;
			
 
				-
			
 
				-	status = __block_write_begin(page, pos, len, xfs_get_blocks);
			
 
				-	if (xfs_mp_fail_writes(mp))
			
 
				-		status = -EIO;
			
 
				-	if (unlikely(status)) {
			
 
				-		struct inode	*inode = mapping->host;
			
 
				-		size_t		isize = i_size_read(inode);
			
 
				-
			
 
				-		xfs_vm_write_failed(inode, page, pos, len);
			
 
				-		unlock_page(page);
			
 
				-
			
 
				-		/*
			
 
				-		 * If the write is beyond EOF, we only want to kill blocks
			
 
				-		 * allocated in this write, not blocks that were previously
			
 
				-		 * written successfully.
			
 
				-		 */
			
 
				-		if (xfs_mp_fail_writes(mp))
			
 
				-			isize = 0;
			
 
				-		if (pos + len > isize) {
			
 
				-			ssize_t start = max_t(ssize_t, pos, isize);
			
 
				-
			
 
				-			truncate_pagecache_range(inode, start, pos + len);
			
 
				-		}
			
 
				-
			
 
				-		put_page(page);
			
 
				-		page = NULL;
			
 
				-	}
			
 
				-
			
 
				-	*pagep = page;
			
 
				-	return status;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
			
 
				- * this specific write because they will never be written. Previous writes
			
 
				- * beyond EOF where block allocation succeeded do not need to be trashed, so
			
 
				- * only new blocks from this write should be trashed. For blocks within
			
 
				- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
			
 
				- * written with all the other valid data.
			
 
				- */
			
 
				-STATIC int
			
 
				-xfs_vm_write_end(
			
 
				-	struct file		*file,
			
 
				-	struct address_space	*mapping,
			
 
				-	loff_t			pos,
			
 
				-	unsigned		len,
			
 
				-	unsigned		copied,
			
 
				-	struct page		*page,
			
 
				-	void			*fsdata)
			
 
				-{
			
 
				-	int			ret;
			
 
				-
			
 
				-	ASSERT(len <= PAGE_SIZE);
			
 
				-
			
 
				-	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
			
 
				-	if (unlikely(ret < len)) {
			
 
				-		struct inode	*inode = mapping->host;
			
 
				-		size_t		isize = i_size_read(inode);
			
 
				-		loff_t		to = pos + len;
			
 
				-
			
 
				-		if (to > isize) {
			
 
				-			/* only kill blocks in this write beyond EOF */
			
 
				-			if (pos > isize)
			
 
				-				isize = pos;
			
 
				-			xfs_vm_kill_delalloc_range(inode, isize, to);
			
 
				-			truncate_pagecache_range(inode, isize, to);
			
 
				-		}
			
 
				-	}
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				 STATIC sector_t
			
 
				 xfs_vm_bmap(
			
 
				 	struct address_space	*mapping,
			
@@ -1747,8 +1504,6 @@ const struct address_space_operations xfs_address_space_operations = {
 
				 	.set_page_dirty		= xfs_vm_set_page_dirty,
			
 
				 	.releasepage		= xfs_vm_releasepage,
			
 
				 	.invalidatepage		= xfs_vm_invalidatepage,
			
 
				-	.write_begin		= xfs_vm_write_begin,
			
 
				-	.write_end		= xfs_vm_write_end,
			
 
				 	.bmap			= xfs_vm_bmap,
			
 
				 	.direct_IO		= xfs_vm_direct_IO,
			
 
				 	.migratepage		= buffer_migrate_page,
			
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1087,99 +1087,120 @@ error1:	/* Just cancel transaction */
 
				 	return error;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Zero file bytes between startoff and endoff inclusive.
			
 
				- * The iolock is held exclusive and no blocks are buffered.
			
 
				- *
			
 
				- * This function is used by xfs_free_file_space() to zero
			
 
				- * partial blocks when the range to free is not block aligned.
			
 
				- * When unreserving space with boundaries that are not block
			
 
				- * aligned we round up the start and round down the end
			
 
				- * boundaries and then use this function to zero the parts of
			
 
				- * the blocks that got dropped during the rounding.
			
 
				- */
			
 
				-STATIC int
			
 
				-xfs_zero_remaining_bytes(
			
 
				-	xfs_inode_t		*ip,
			
 
				-	xfs_off_t		startoff,
			
 
				-	xfs_off_t		endoff)
			
 
				+static int
			
 
				+xfs_unmap_extent(
			
 
				+	struct xfs_inode	*ip,
			
 
				+	xfs_fileoff_t		startoffset_fsb,
			
 
				+	xfs_filblks_t		len_fsb,
			
 
				+	int			*done)
			
 
				 {
			
 
				-	xfs_bmbt_irec_t		imap;
			
 
				-	xfs_fileoff_t		offset_fsb;
			
 
				-	xfs_off_t		lastoffset;
			
 
				-	xfs_off_t		offset;
			
 
				-	xfs_buf_t		*bp;
			
 
				-	xfs_mount_t		*mp = ip->i_mount;
			
 
				-	int			nimap;
			
 
				-	int			error = 0;
			
 
				+	struct xfs_mount	*mp = ip->i_mount;
			
 
				+	struct xfs_trans	*tp;
			
 
				+	struct xfs_bmap_free	free_list;
			
 
				+	xfs_fsblock_t		firstfsb;
			
 
				+	uint			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
			
 
				+	int			error;
			
 
				 
			
 
				-	/*
			
 
				-	 * Avoid doing I/O beyond eof - it's not necessary
			
 
				-	 * since nothing can read beyond eof.  The space will
			
 
				-	 * be zeroed when the file is extended anyway.
			
 
				-	 */
			
 
				-	if (startoff >= XFS_ISIZE(ip))
			
 
				-		return 0;
			
 
				+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
			
 
				+	if (error) {
			
 
				+		ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			
 
				+		return error;
			
 
				+	}
			
 
				 
			
 
				-	if (endoff > XFS_ISIZE(ip))
			
 
				-		endoff = XFS_ISIZE(ip);
			
 
				+	xfs_ilock(ip, XFS_ILOCK_EXCL);
			
 
				+	error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
			
 
				+			ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
			
 
				+	if (error)
			
 
				+		goto out_trans_cancel;
			
 
				 
			
 
				-	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
			
 
				-		uint lock_mode;
			
 
				+	xfs_trans_ijoin(tp, ip, 0);
			
 
				 
			
 
				-		offset_fsb = XFS_B_TO_FSBT(mp, offset);
			
 
				-		nimap = 1;
			
 
				+	xfs_bmap_init(&free_list, &firstfsb);
			
 
				+	error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
			
 
				+			&free_list, done);
			
 
				+	if (error)
			
 
				+		goto out_bmap_cancel;
			
 
				 
			
 
				-		lock_mode = xfs_ilock_data_map_shared(ip);
			
 
				-		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
			
 
				-		xfs_iunlock(ip, lock_mode);
			
 
				+	error = xfs_bmap_finish(&tp, &free_list, NULL);
			
 
				+	if (error)
			
 
				+		goto out_bmap_cancel;
			
 
				 
			
 
				-		if (error || nimap < 1)
			
 
				-			break;
			
 
				-		ASSERT(imap.br_blockcount >= 1);
			
 
				-		ASSERT(imap.br_startoff == offset_fsb);
			
 
				-		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			
 
				+	error = xfs_trans_commit(tp);
			
 
				+out_unlock:
			
 
				+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
			
 
				+	return error;
			
 
				 
			
 
				-		if (imap.br_startblock == HOLESTARTBLOCK ||
			
 
				-		    imap.br_state == XFS_EXT_UNWRITTEN) {
			
 
				-			/* skip the entire extent */
			
 
				-			lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
			
 
				-						      imap.br_blockcount) - 1;
			
 
				-			continue;
			
 
				-		}
			
 
				+out_bmap_cancel:
			
 
				+	xfs_bmap_cancel(&free_list);
			
 
				+out_trans_cancel:
			
 
				+	xfs_trans_cancel(tp);
			
 
				+	goto out_unlock;
			
 
				+}
			
 
				 
			
 
				-		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
			
 
				-		if (lastoffset > endoff)
			
 
				-			lastoffset = endoff;
			
 
				+static int
			
 
				+xfs_adjust_extent_unmap_boundaries(
			
 
				+	struct xfs_inode	*ip,
			
 
				+	xfs_fileoff_t		*startoffset_fsb,
			
 
				+	xfs_fileoff_t		*endoffset_fsb)
			
 
				+{
			
 
				+	struct xfs_mount	*mp = ip->i_mount;
			
 
				+	struct xfs_bmbt_irec	imap;
			
 
				+	int			nimap, error;
			
 
				+	xfs_extlen_t		mod = 0;
			
 
				 
			
 
				-		/* DAX can just zero the backing device directly */
			
 
				-		if (IS_DAX(VFS_I(ip))) {
			
 
				-			error = dax_zero_page_range(VFS_I(ip), offset,
			
 
				-						    lastoffset - offset + 1,
			
 
				-						    xfs_get_blocks_direct);
			
 
				-			if (error)
			
 
				-				return error;
			
 
				-			continue;
			
 
				-		}
			
 
				+	nimap = 1;
			
 
				+	error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
			
 
				+	if (error)
			
 
				+		return error;
			
 
				 
			
 
				-		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
			
 
				-				mp->m_rtdev_targp : mp->m_ddev_targp,
			
 
				-				xfs_fsb_to_db(ip, imap.br_startblock),
			
 
				-				BTOBB(mp->m_sb.sb_blocksize),
			
 
				-				0, &bp, NULL);
			
 
				-		if (error)
			
 
				-			return error;
			
 
				+	if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			
 
				+		xfs_daddr_t	block;
			
 
				 
			
 
				-		memset(bp->b_addr +
			
 
				-				(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
			
 
				-		       0, lastoffset - offset + 1);
			
 
				+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			
 
				+		block = imap.br_startblock;
			
 
				+		mod = do_div(block, mp->m_sb.sb_rextsize);
			
 
				+		if (mod)
			
 
				+			*startoffset_fsb += mp->m_sb.sb_rextsize - mod;
			
 
				+	}
			
 
				 
			
 
				-		error = xfs_bwrite(bp);
			
 
				-		xfs_buf_relse(bp);
			
 
				-		if (error)
			
 
				-			return error;
			
 
				+	nimap = 1;
			
 
				+	error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
			
 
				+	if (error)
			
 
				+		return error;
			
 
				+
			
 
				+	if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			
 
				+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			
 
				+		mod++;
			
 
				+		if (mod && mod != mp->m_sb.sb_rextsize)
			
 
				+			*endoffset_fsb -= mod;
			
 
				 	}
			
 
				-	return error;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+xfs_flush_unmap_range(
			
 
				+	struct xfs_inode	*ip,
			
 
				+	xfs_off_t		offset,
			
 
				+	xfs_off_t		len)
			
 
				+{
			
 
				+	struct xfs_mount	*mp = ip->i_mount;
			
 
				+	struct inode		*inode = VFS_I(ip);
			
 
				+	xfs_off_t		rounding, start, end;
			
 
				+	int			error;
			
 
				+
			
 
				+	/* wait for the completion of any pending DIOs */
			
 
				+	inode_dio_wait(inode);
			
 
				+
			
 
				+	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
			
 
				+	start = round_down(offset, rounding);
			
 
				+	end = round_up(offset + len, rounding) - 1;
			
 
				+
			
 
				+	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
			
 
				+	if (error)
			
 
				+		return error;
			
 
				+	truncate_pagecache_range(inode, start, end);
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				 int
			
@@ -1188,24 +1209,10 @@ xfs_free_file_space(
 
				 	xfs_off_t		offset,
			
 
				 	xfs_off_t		len)
			
 
				 {
			
 
				-	int			done;
			
 
				-	xfs_fileoff_t		endoffset_fsb;
			
 
				-	int			error;
			
 
				-	xfs_fsblock_t		firstfsb;
			
 
				-	xfs_bmap_free_t		free_list;
			
 
				-	xfs_bmbt_irec_t		imap;
			
 
				-	xfs_off_t		ioffset;
			
 
				-	xfs_off_t		iendoffset;
			
 
				-	xfs_extlen_t		mod=0;
			
 
				-	xfs_mount_t		*mp;
			
 
				-	int			nimap;
			
 
				-	uint			resblks;
			
 
				-	xfs_off_t		rounding;
			
 
				-	int			rt;
			
 
				+	struct xfs_mount	*mp = ip->i_mount;
			
 
				 	xfs_fileoff_t		startoffset_fsb;
			
 
				-	xfs_trans_t		*tp;
			
 
				-
			
 
				-	mp = ip->i_mount;
			
 
				+	xfs_fileoff_t		endoffset_fsb;
			
 
				+	int			done = 0, error;
			
 
				 
			
 
				 	trace_xfs_free_file_space(ip);
			
 
				 
			
@@ -1213,135 +1220,45 @@ xfs_free_file_space(
 
				 	if (error)
			
 
				 		return error;
			
 
				 
			
 
				-	error = 0;
			
 
				 	if (len <= 0)	/* if nothing being freed */
			
 
				-		return error;
			
 
				-	rt = XFS_IS_REALTIME_INODE(ip);
			
 
				-	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
			
 
				-	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
			
 
				-
			
 
				-	/* wait for the completion of any pending DIOs */
			
 
				-	inode_dio_wait(VFS_I(ip));
			
 
				+		return 0;
			
 
				 
			
 
				-	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
			
 
				-	ioffset = round_down(offset, rounding);
			
 
				-	iendoffset = round_up(offset + len, rounding) - 1;
			
 
				-	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
			
 
				-					     iendoffset);
			
 
				+	error = xfs_flush_unmap_range(ip, offset, len);
			
 
				 	if (error)
			
 
				-		goto out;
			
 
				-	truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
			
 
				+		return error;
			
 
				+
			
 
				+	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
			
 
				+	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
			
 
				 
			
 
				 	/*
			
 
				-	 * Need to zero the stuff we're not freeing, on disk.
			
 
				-	 * If it's a realtime file & can't use unwritten extents then we
			
 
				-	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
			
 
				-	 * will take care of it for us.
			
 
				+	 * Need to zero the stuff we're not freeing, on disk.  If it's a RT file
			
 
				+	 * and we can't use unwritten extents then we actually need to ensure
			
 
				+	 * to zero the whole extent, otherwise we just need to take care of block
			
 
				+	 * boundaries, and xfs_bunmapi will handle the rest.
			
 
				 	 */
			
 
				-	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
			
 
				-		nimap = 1;
			
 
				-		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
			
 
				-					&imap, &nimap, 0);
			
 
				+	if (XFS_IS_REALTIME_INODE(ip) &&
			
 
				+	    !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
			
 
				+		error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
			
 
				+				&endoffset_fsb);
			
 
				 		if (error)
			
 
				-			goto out;
			
 
				-		ASSERT(nimap == 0 || nimap == 1);
			
 
				-		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			
 
				-			xfs_daddr_t	block;
			
 
				-
			
 
				-			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			
 
				-			block = imap.br_startblock;
			
 
				-			mod = do_div(block, mp->m_sb.sb_rextsize);
			
 
				-			if (mod)
			
 
				-				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
			
 
				-		}
			
 
				-		nimap = 1;
			
 
				-		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
			
 
				-					&imap, &nimap, 0);
			
 
				-		if (error)
			
 
				-			goto out;
			
 
				-		ASSERT(nimap == 0 || nimap == 1);
			
 
				-		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			
 
				-			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			
 
				-			mod++;
			
 
				-			if (mod && (mod != mp->m_sb.sb_rextsize))
			
 
				-				endoffset_fsb -= mod;
			
 
				-		}
			
 
				-	}
			
 
				-	if ((done = (endoffset_fsb <= startoffset_fsb)))
			
 
				-		/*
			
 
				-		 * One contiguous piece to clear
			
 
				-		 */
			
 
				-		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
			
 
				-	else {
			
 
				-		/*
			
 
				-		 * Some full blocks, possibly two pieces to clear
			
 
				-		 */
			
 
				-		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
			
 
				-			error = xfs_zero_remaining_bytes(ip, offset,
			
 
				-				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
			
 
				-		if (!error &&
			
 
				-		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
			
 
				-			error = xfs_zero_remaining_bytes(ip,
			
 
				-				XFS_FSB_TO_B(mp, endoffset_fsb),
			
 
				-				offset + len - 1);
			
 
				+			return error;
			
 
				 	}
			
 
				 
			
 
				-	/*
			
 
				-	 * free file space until done or until there is an error
			
 
				-	 */
			
 
				-	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
			
 
				-	while (!error && !done) {
			
 
				-
			
 
				-		/*
			
 
				-		 * allocate and setup the transaction. Allow this
			
 
				-		 * transaction to dip into the reserve blocks to ensure
			
 
				-		 * the freeing of the space succeeds at ENOSPC.
			
 
				-		 */
			
 
				-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
			
 
				-				&tp);
			
 
				-		if (error) {
			
 
				-			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			
 
				-			break;
			
 
				+	if (endoffset_fsb > startoffset_fsb) {
			
 
				+		while (!done) {
			
 
				+			error = xfs_unmap_extent(ip, startoffset_fsb,
			
 
				+					endoffset_fsb - startoffset_fsb, &done);
			
 
				+			if (error)
			
 
				+				return error;
			
 
				 		}
			
 
				-		xfs_ilock(ip, XFS_ILOCK_EXCL);
			
 
				-		error = xfs_trans_reserve_quota(tp, mp,
			
 
				-				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
			
 
				-				resblks, 0, XFS_QMOPT_RES_REGBLKS);
			
 
				-		if (error)
			
 
				-			goto error1;
			
 
				-
			
 
				-		xfs_trans_ijoin(tp, ip, 0);
			
 
				-
			
 
				-		/*
			
 
				-		 * issue the bunmapi() call to free the blocks
			
 
				-		 */
			
 
				-		xfs_bmap_init(&free_list, &firstfsb);
			
 
				-		error = xfs_bunmapi(tp, ip, startoffset_fsb,
			
 
				-				  endoffset_fsb - startoffset_fsb,
			
 
				-				  0, 2, &firstfsb, &free_list, &done);
			
 
				-		if (error)
			
 
				-			goto error0;
			
 
				-
			
 
				-		/*
			
 
				-		 * complete the transaction
			
 
				-		 */
			
 
				-		error = xfs_bmap_finish(&tp, &free_list, NULL);
			
 
				-		if (error)
			
 
				-			goto error0;
			
 
				-
			
 
				-		error = xfs_trans_commit(tp);
			
 
				-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
			
 
				 	}
			
 
				 
			
 
				- out:
			
 
				-	return error;
			
 
				-
			
 
				- error0:
			
 
				-	xfs_bmap_cancel(&free_list);
			
 
				- error1:
			
 
				-	xfs_trans_cancel(tp);
			
 
				-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
			
 
				-	goto out;
			
 
				+	/*
			
 
				+	 * Now that we've unmapped all full blocks we'll have to zero out any
			
 
				+	 * partial block at the beginning and/or end.  xfs_zero_range is
			
 
				+	 * smart enough to skip any holes, including those we just created.
			
 
				+	 */
			
 
				+	return xfs_zero_range(ip, offset, len, NULL);
			
 
				 }
			
 
				 
			
 
				 /*
			
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
 
				 #include "xfs_log.h"
			
 
				 #include "xfs_icache.h"
			
 
				 #include "xfs_pnfs.h"
			
 
				+#include "xfs_iomap.h"
			
 
				 
			
 
				 #include <linux/dcache.h>
			
 
				 #include <linux/falloc.h>
			
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * xfs_iozero clears the specified range supplied via the page cache (except in
			
 
				- * the DAX case). Writes through the page cache will allocate blocks over holes,
			
 
				- * though the callers usually map the holes first and avoid them. If a block is
			
 
				- * not completely zeroed, then it will be read from disk before being partially
			
 
				- * zeroed.
			
 
				- *
			
 
				- * In the DAX case, we can just directly write to the underlying pages. This
			
 
				- * will not allocate blocks, but will avoid holes and unwritten extents and so
			
 
				- * not do unnecessary work.
			
 
				+ * Clear the specified ranges to zero through either the pagecache or DAX.
			
 
				+ * Holes and unwritten extents will be left as-is as they already are zeroed.
			
 
				  */
			
 
				 int
			
 
				-xfs_iozero(
			
 
				-	struct xfs_inode	*ip,	/* inode			*/
			
 
				-	loff_t			pos,	/* offset in file		*/
			
 
				-	size_t			count)	/* size of data to zero		*/
			
 
				+xfs_zero_range(
			
 
				+	struct xfs_inode	*ip,
			
 
				+	xfs_off_t		pos,
			
 
				+	xfs_off_t		count,
			
 
				+	bool			*did_zero)
			
 
				 {
			
 
				-	struct page		*page;
			
 
				-	struct address_space	*mapping;
			
 
				-	int			status = 0;
			
 
				-
			
 
				-
			
 
				-	mapping = VFS_I(ip)->i_mapping;
			
 
				-	do {
			
 
				-		unsigned offset, bytes;
			
 
				-		void *fsdata;
			
 
				-
			
 
				-		offset = (pos & (PAGE_SIZE -1)); /* Within page */
			
 
				-		bytes = PAGE_SIZE - offset;
			
 
				-		if (bytes > count)
			
 
				-			bytes = count;
			
 
				-
			
 
				-		if (IS_DAX(VFS_I(ip))) {
			
 
				-			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
			
 
				-						     xfs_get_blocks_direct);
			
 
				-			if (status)
			
 
				-				break;
			
 
				-		} else {
			
 
				-			status = pagecache_write_begin(NULL, mapping, pos, bytes,
			
 
				-						AOP_FLAG_UNINTERRUPTIBLE,
			
 
				-						&page, &fsdata);
			
 
				-			if (status)
			
 
				-				break;
			
 
				-
			
 
				-			zero_user(page, offset, bytes);
			
 
				-
			
 
				-			status = pagecache_write_end(NULL, mapping, pos, bytes,
			
 
				-						bytes, page, fsdata);
			
 
				-			WARN_ON(status <= 0); /* can't return less than zero! */
			
 
				-			status = 0;
			
 
				-		}
			
 
				-		pos += bytes;
			
 
				-		count -= bytes;
			
 
				-	} while (count);
			
 
				-
			
 
				-	return status;
			
 
				+	return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops);
			
 
				 }
			
 
				 
			
 
				 int
			
@@ -423,49 +380,6 @@ out:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * This routine is called to handle zeroing any space in the last block of the
			
 
				- * file that is beyond the EOF.  We do this since the size is being increased
			
 
				- * without writing anything to that block and we don't want to read the
			
 
				- * garbage on the disk.
			
 
				- */
			
 
				-STATIC int				/* error (positive) */
			
 
				-xfs_zero_last_block(
			
 
				-	struct xfs_inode	*ip,
			
 
				-	xfs_fsize_t		offset,
			
 
				-	xfs_fsize_t		isize,
			
 
				-	bool			*did_zeroing)
			
 
				-{
			
 
				-	struct xfs_mount	*mp = ip->i_mount;
			
 
				-	xfs_fileoff_t		last_fsb = XFS_B_TO_FSBT(mp, isize);
			
 
				-	int			zero_offset = XFS_B_FSB_OFFSET(mp, isize);
			
 
				-	int			zero_len;
			
 
				-	int			nimaps = 1;
			
 
				-	int			error = 0;
			
 
				-	struct xfs_bmbt_irec	imap;
			
 
				-
			
 
				-	xfs_ilock(ip, XFS_ILOCK_EXCL);
			
 
				-	error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
			
 
				-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
			
 
				-	if (error)
			
 
				-		return error;
			
 
				-
			
 
				-	ASSERT(nimaps > 0);
			
 
				-
			
 
				-	/*
			
 
				-	 * If the block underlying isize is just a hole, then there
			
 
				-	 * is nothing to zero.
			
 
				-	 */
			
 
				-	if (imap.br_startblock == HOLESTARTBLOCK)
			
 
				-		return 0;
			
 
				-
			
 
				-	zero_len = mp->m_sb.sb_blocksize - zero_offset;
			
 
				-	if (isize + zero_len > offset)
			
 
				-		zero_len = offset - isize;
			
 
				-	*did_zeroing = true;
			
 
				-	return xfs_iozero(ip, isize, zero_len);
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * Zero any on disk space between the current EOF and the new, larger EOF.
			
 
				  *
			
@@ -484,94 +398,11 @@ xfs_zero_eof(
 
				 	xfs_fsize_t		isize,		/* current inode size */
			
 
				 	bool			*did_zeroing)
			
 
				 {
			
 
				-	struct xfs_mount	*mp = ip->i_mount;
			
 
				-	xfs_fileoff_t		start_zero_fsb;
			
 
				-	xfs_fileoff_t		end_zero_fsb;
			
 
				-	xfs_fileoff_t		zero_count_fsb;
			
 
				-	xfs_fileoff_t		last_fsb;
			
 
				-	xfs_fileoff_t		zero_off;
			
 
				-	xfs_fsize_t		zero_len;
			
 
				-	int			nimaps;
			
 
				-	int			error = 0;
			
 
				-	struct xfs_bmbt_irec	imap;
			
 
				-
			
 
				 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
			
 
				 	ASSERT(offset > isize);
			
 
				 
			
 
				 	trace_xfs_zero_eof(ip, isize, offset - isize);
			
 
				-
			
 
				-	/*
			
 
				-	 * First handle zeroing the block on which isize resides.
			
 
				-	 *
			
 
				-	 * We only zero a part of that block so it is handled specially.
			
 
				-	 */
			
 
				-	if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
			
 
				-		error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
			
 
				-		if (error)
			
 
				-			return error;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * Calculate the range between the new size and the old where blocks
			
 
				-	 * needing to be zeroed may exist.
			
 
				-	 *
			
 
				-	 * To get the block where the last byte in the file currently resides,
			
 
				-	 * we need to subtract one from the size and truncate back to a block
			
 
				-	 * boundary.  We subtract 1 in case the size is exactly on a block
			
 
				-	 * boundary.
			
 
				-	 */
			
 
				-	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
			
 
				-	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
			
 
				-	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
			
 
				-	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
			
 
				-	if (last_fsb == end_zero_fsb) {
			
 
				-		/*
			
 
				-		 * The size was only incremented on its last block.
			
 
				-		 * We took care of that above, so just return.
			
 
				-		 */
			
 
				-		return 0;
			
 
				-	}
			
 
				-
			
 
				-	ASSERT(start_zero_fsb <= end_zero_fsb);
			
 
				-	while (start_zero_fsb <= end_zero_fsb) {
			
 
				-		nimaps = 1;
			
 
				-		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
			
 
				-
			
 
				-		xfs_ilock(ip, XFS_ILOCK_EXCL);
			
 
				-		error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
			
 
				-					  &imap, &nimaps, 0);
			
 
				-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
			
 
				-		if (error)
			
 
				-			return error;
			
 
				-
			
 
				-		ASSERT(nimaps > 0);
			
 
				-
			
 
				-		if (imap.br_state == XFS_EXT_UNWRITTEN ||
			
 
				-		    imap.br_startblock == HOLESTARTBLOCK) {
			
 
				-			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
			
 
				-			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
			
 
				-			continue;
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-		 * There are blocks we need to zero.
			
 
				-		 */
			
 
				-		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
			
 
				-		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
			
 
				-
			
 
				-		if ((zero_off + zero_len) > offset)
			
 
				-			zero_len = offset - zero_off;
			
 
				-
			
 
				-		error = xfs_iozero(ip, zero_off, zero_len);
			
 
				-		if (error)
			
 
				-			return error;
			
 
				-
			
 
				-		*did_zeroing = true;
			
 
				-		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
			
 
				-		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
			
 
				-	}
			
 
				-
			
 
				-	return 0;
			
 
				+	return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -841,7 +672,7 @@ xfs_file_buffered_aio_write(
 
				 write_retry:
			
 
				 	trace_xfs_file_buffered_write(ip, iov_iter_count(from),
			
 
				 				      iocb->ki_pos, 0);
			
 
				-	ret = generic_perform_write(file, from, iocb->ki_pos);
			
 
				+	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
			
 
				 	if (likely(ret >= 0))
			
 
				 		iocb->ki_pos += ret;
			
 
				 
			
@@ -1553,7 +1384,7 @@ xfs_filemap_page_mkwrite(
 
				 	if (IS_DAX(inode)) {
			
 
				 		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
			
 
				 	} else {
			
 
				-		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
			
 
				+		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
			
 
				 		ret = block_page_mkwrite_return(ret);
			
 
				 	}
			
 
				 
			
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -427,7 +427,8 @@ int	xfs_update_prealloc_flags(struct xfs_inode *ip,
 
				 				  enum xfs_prealloc_flags flags);
			
 
				 int	xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
			
 
				 		     xfs_fsize_t isize, bool *did_zeroing);
			
 
				-int	xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
			
 
				+int	xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
			
 
				+		bool *did_zero);
			
 
				 loff_t	__xfs_seek_hole_data(struct inode *inode, loff_t start,
			
 
				 			     loff_t eof, int whence);
			
 
				 
			
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -15,6 +15,7 @@
 
				  * along with this program; if not, write the Free Software Foundation,
			
 
				  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
			
 
				  */
			
 
				+#include <linux/iomap.h>
			
 
				 #include "xfs.h"
			
 
				 #include "xfs_fs.h"
			
 
				 #include "xfs_shared.h"
			
@@ -940,3 +941,173 @@ error_on_bmapi_transaction:
 
				 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
			
 
				 	return error;
			
 
				 }
			
 
				+
			
 
				+/*
				+ * Fill in @iomap from the XFS extent record @imap.
				+ *
				+ * Holes and delayed allocations get IOMAP_NULL_BLOCK as their block
				+ * number; any other extent gets the address computed by xfs_fsb_to_db()
				+ * and is typed as unwritten or mapped according to br_state.  Offset and
				+ * length are converted from filesystem blocks to bytes.
				+ */
				+void
				+xfs_bmbt_to_iomap(
				+	struct xfs_inode	*ip,
				+	struct iomap		*iomap,
				+	struct xfs_bmbt_irec	*imap)
				+{
				+	struct xfs_mount	*mp = ip->i_mount;
				+	bool			is_hole;
				+	bool			is_delalloc;
				+
				+	is_hole = imap->br_startblock == HOLESTARTBLOCK;
				+	is_delalloc = imap->br_startblock == DELAYSTARTBLOCK;
				+
				+	if (is_hole || is_delalloc) {
				+		/* No block number to report for these two extent kinds. */
				+		iomap->blkno = IOMAP_NULL_BLOCK;
				+		iomap->type = is_hole ? IOMAP_HOLE : IOMAP_DELALLOC;
				+	} else {
				+		iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
				+		iomap->type = (imap->br_state == XFS_EXT_UNWRITTEN) ?
				+				IOMAP_UNWRITTEN : IOMAP_MAPPED;
				+	}
				+
				+	/* The byte range and block device are set the same way in all cases. */
				+	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
				+	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
				+	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
				+}
			
 
				+
			
 
				+/*
				+ * Return true when the lookup produced no mapping at all, or when the
				+ * mapping it produced is a hole or a delayed allocation.
				+ */
				+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
				+{
				+	if (nimaps == 0)
				+		return true;
				+	if (imap->br_startblock == HOLESTARTBLOCK)
				+		return true;
				+	return imap->br_startblock == DELAYSTARTBLOCK;
				+}
			
 
				+
			
 
				+/*
				+ * ->iomap_begin callback: look up the extent covering @offset/@length
				+ * and describe it in @iomap.
				+ *
				+ * For IOMAP_WRITE requests where imap_needs_alloc() is true, the range
				+ * is first handed to xfs_iomap_write_direct() (inode has an extent size
				+ * hint) or xfs_iomap_write_delay() (no hint).  Otherwise the existing
				+ * mapping is reported as-is, or a hole is reported if the lookup
				+ * returned no mapping.
				+ *
				+ * Returns 0 on success, -EIO if the filesystem is shut down, or the
				+ * error returned by the mapping/allocation helpers.
				+ */
				+static int
				+xfs_file_iomap_begin(
				+	struct inode		*inode,
				+	loff_t			offset,
				+	loff_t			length,
				+	unsigned		flags,
				+	struct iomap		*iomap)
				+{
				+	struct xfs_inode	*ip = XFS_I(inode);
				+	struct xfs_mount	*mp = ip->i_mount;
				+	struct xfs_bmbt_irec	imap;
				+	xfs_fileoff_t		first_fsb;
				+	xfs_fileoff_t		limit_fsb;
				+	int			nimaps = 1;
				+	int			error = 0;
				+
				+	if (XFS_FORCED_SHUTDOWN(mp))
				+		return -EIO;
				+
				+	xfs_ilock(ip, XFS_ILOCK_EXCL);
				+
				+	/* Clamp the request so it never extends past s_maxbytes. */
				+	ASSERT(offset <= mp->m_super->s_maxbytes);
				+	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
				+		length = mp->m_super->s_maxbytes - offset;
				+	first_fsb = XFS_B_TO_FSBT(mp, offset);
				+	limit_fsb = XFS_B_TO_FSB(mp, offset + length);
				+
				+	error = xfs_bmapi_read(ip, first_fsb, limit_fsb - first_fsb, &imap,
				+			       &nimaps, XFS_BMAPI_ENTIRE);
				+	if (error) {
				+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
				+		return error;
				+	}
				+
				+	/*
				+	 * Not a write, or a write over an extent that needs no allocation:
				+	 * drop the lock and report what the lookup found.
				+	 */
				+	if (!(flags & IOMAP_WRITE) || !imap_needs_alloc(&imap, nimaps)) {
				+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
				+
				+		if (!nimaps) {
				+			/* The lookup returned nothing; report a hole. */
				+			trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
				+			iomap->blkno = IOMAP_NULL_BLOCK;
				+			iomap->type = IOMAP_HOLE;
				+			iomap->offset = offset;
				+			iomap->length = length;
				+		} else {
				+			trace_xfs_iomap_found(ip, offset, length, 0, &imap);
				+			xfs_bmbt_to_iomap(ip, iomap, &imap);
				+		}
				+		return 0;
				+	}
				+
				+	/*
				+	 * Cap the amount mapped per call at 1024 pages so each chunk of
				+	 * work stays roughly comparable to what writeback handles.  The
				+	 * figure is an arbitrary first guess for initial testing.
				+	 *
				+	 * The value has to stay below 32 bits wide until the lower level
				+	 * functions are updated.
				+	 */
				+	length = min_t(loff_t, length, 1024 * PAGE_SIZE);
				+
				+	if (xfs_get_extsz_hint(ip)) {
				+		/*
				+		 * xfs_iomap_write_direct() expects the shared lock and
				+		 * unlocks it on return, so demote rather than unlock.
				+		 */
				+		xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
				+		error = xfs_iomap_write_direct(ip, offset, length, &imap,
				+				nimaps);
				+	} else {
				+		error = xfs_iomap_write_delay(ip, offset, length, &imap);
				+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
				+	}
				+	if (error)
				+		return error;
				+
				+	trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
				+	xfs_bmbt_to_iomap(ip, iomap, &imap);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Punch out the delalloc blocks between the end of what was written
				+ * (@offset + @written) and the end of the mapped range (@offset +
				+ * @length), both rounded up to filesystem blocks.
				+ *
				+ * Returns 0 when there is nothing to punch, when the punch succeeds, or
				+ * when it fails on a shut-down filesystem; otherwise logs an alert and
				+ * returns the punch error.
				+ */
				+static int
				+xfs_file_iomap_end_delalloc(
				+	struct xfs_inode	*ip,
				+	loff_t			offset,
				+	loff_t			length,
				+	ssize_t			written)
				+{
				+	struct xfs_mount	*mp = ip->i_mount;
				+	xfs_fileoff_t		punch_start;
				+	xfs_fileoff_t		punch_end;
				+	int			error;
				+
				+	punch_start = XFS_B_TO_FSB(mp, offset + written);
				+	punch_end = XFS_B_TO_FSB(mp, offset + length);
				+
				+	/* The whole reservation was used; nothing to trim back. */
				+	if (punch_start >= punch_end)
				+		return 0;
				+
				+	/*
				+	 * Racing delalloc is not a concern: i_mutex is held across the
				+	 * reserve/allocate/unreserve calls, so any delalloc blocks in the
				+	 * range are ours.
				+	 */
				+	xfs_ilock(ip, XFS_ILOCK_EXCL);
				+	error = xfs_bmap_punch_delalloc_range(ip, punch_start,
				+				       punch_end - punch_start);
				+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				+
				+	/* A failure on a shut-down filesystem is not reported. */
				+	if (!error || XFS_FORCED_SHUTDOWN(mp))
				+		return 0;
				+
				+	xfs_alert(mp, "%s: unable to clean up ino %lld",
				+		__func__, ip->i_ino);
				+	return error;
				+}
			
 
				+
			
 
				+/*
				+ * ->iomap_end callback.  Only IOMAP_WRITE operations on a delalloc
				+ * mapping need any work here: they are passed to
				+ * xfs_file_iomap_end_delalloc().  Everything else returns 0.
				+ */
				+static int
				+xfs_file_iomap_end(
				+	struct inode		*inode,
				+	loff_t			offset,
				+	loff_t			length,
				+	ssize_t			written,
				+	unsigned		flags,
				+	struct iomap		*iomap)
				+{
				+	if (!(flags & IOMAP_WRITE) || iomap->type != IOMAP_DELALLOC)
				+		return 0;
				+
				+	return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length,
				+			written);
				+}
			
 
				+
			
 
				+struct iomap_ops xfs_iomap_ops = {	/* XFS callbacks passed to the iomap_*() helpers */
			
 
				+	.iomap_begin		= xfs_file_iomap_begin,	/* look up the range; set up space for writes that need it */
			
 
				+	.iomap_end		= xfs_file_iomap_end,	/* punch delalloc blocks left past the written range */
			
 
				+};
			
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,6 +18,8 @@
 
				 #ifndef __XFS_IOMAP_H__
			
 
				 #define __XFS_IOMAP_H__
			
 
				 
			
 
				+#include <linux/iomap.h>
			
 
				+
			
 
				 struct xfs_inode;
			
 
				 struct xfs_bmbt_irec;
			
 
				 
			
@@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
 
				 			struct xfs_bmbt_irec *);
			
 
				 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
			
 
				 
			
 
				+void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
			
 
				+		struct xfs_bmbt_irec *);
			
 
				+
			
 
				+extern struct iomap_ops xfs_iomap_ops;
			
 
				+
			
 
				 #endif /* __XFS_IOMAP_H__*/
			
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,12 +38,13 @@
 
				 #include "xfs_dir2.h"
			
 
				 #include "xfs_trans_space.h"
			
 
				 #include "xfs_pnfs.h"
			
 
				+#include "xfs_iomap.h"
			
 
				 
			
 
				 #include <linux/capability.h>
			
 
				 #include <linux/xattr.h>
			
 
				 #include <linux/posix_acl.h>
			
 
				 #include <linux/security.h>
			
 
				-#include <linux/fiemap.h>
			
 
				+#include <linux/iomap.h>
			
 
				 #include <linux/slab.h>
			
 
				 
			
 
				 /*
			
@@ -800,21 +801,31 @@ xfs_setattr_size(
 
				 	if (error)
			
 
				 		return error;
			
 
				 
			
 
				+	/*
			
 
				+	 * Wait for all direct I/O to complete.
			
 
				+	 */
			
 
				+	inode_dio_wait(inode);
			
 
				+
			
 
				 	/*
			
 
				 	 * File data changes must be complete before we start the transaction to
			
 
				 	 * modify the inode.  This needs to be done before joining the inode to
			
 
				 	 * the transaction because the inode cannot be unlocked once it is a
			
 
				 	 * part of the transaction.
			
 
				 	 *
			
 
				-	 * Start with zeroing any data block beyond EOF that we may expose on
			
 
				-	 * file extension.
			
 
				+	 * Start with zeroing any data beyond EOF that we may expose on file
			
 
				+	 * extension, or zeroing out the rest of the block on a downward
			
 
				+	 * truncate.
			
 
				 	 */
			
 
				 	if (newsize > oldsize) {
			
 
				 		error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
			
 
				-		if (error)
			
 
				-			return error;
			
 
				+	} else {
			
 
				+		error = iomap_truncate_page(inode, newsize, &did_zeroing,
			
 
				+				&xfs_iomap_ops);
			
 
				 	}
			
 
				 
			
 
				+	if (error)
			
 
				+		return error;
			
 
				+
			
 
				 	/*
			
 
				 	 * We are going to log the inode size change in this transaction so
			
 
				 	 * any previous writes that are beyond the on disk EOF and the new
			
@@ -823,17 +834,14 @@ xfs_setattr_size(
 
				 	 * problem. Note that this includes any block zeroing we did above;
			
 
				 	 * otherwise those blocks may not be zeroed after a crash.
			
 
				 	 */
			
 
				-	if (newsize > ip->i_d.di_size &&
			
 
				-	    (oldsize != ip->i_d.di_size || did_zeroing)) {
			
 
				+	if (did_zeroing ||
			
 
				+	    (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
			
 
				 		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
			
 
				 						      ip->i_d.di_size, newsize);
			
 
				 		if (error)
			
 
				 			return error;
			
 
				 	}
			
 
				 
			
 
				-	/* Now wait for all direct I/O to complete. */
			
 
				-	inode_dio_wait(inode);
			
 
				-
			
 
				 	/*
			
 
				 	 * We've already locked out new page faults, so now we can safely remove
			
 
				 	 * pages from the page cache knowing they won't get refaulted until we
			
@@ -851,13 +859,6 @@ xfs_setattr_size(
 
				 	 * to hope that the caller sees ENOMEM and retries the truncate
			
 
				 	 * operation.
			
 
				 	 */
			
 
				-	if (IS_DAX(inode))
			
 
				-		error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
			
 
				-	else
			
 
				-		error = block_truncate_page(inode->i_mapping, newsize,
			
 
				-					    xfs_get_blocks);
			
 
				-	if (error)
			
 
				-		return error;
			
 
				 	truncate_setsize(inode, newsize);
			
 
				 
			
 
				 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
			
@@ -998,51 +999,6 @@ xfs_vn_update_time(
 
				 	return xfs_trans_commit(tp);
			
 
				 }
			
 
				 
			
 
				-#define XFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
			
 
				-
			
 
				-/*
			
 
				- * Call fiemap helper to fill in user data.
			
 
				- * Returns positive errors to xfs_getbmap.
			
 
				- */
			
 
				-STATIC int
			
 
				-xfs_fiemap_format(
			
 
				-	void			**arg,
			
 
				-	struct getbmapx		*bmv,
			
 
				-	int			*full)
			
 
				-{
			
 
				-	int			error;
			
 
				-	struct fiemap_extent_info *fieinfo = *arg;
			
 
				-	u32			fiemap_flags = 0;
			
 
				-	u64			logical, physical, length;
			
 
				-
			
 
				-	/* Do nothing for a hole */
			
 
				-	if (bmv->bmv_block == -1LL)
			
 
				-		return 0;
			
 
				-
			
 
				-	logical = BBTOB(bmv->bmv_offset);
			
 
				-	physical = BBTOB(bmv->bmv_block);
			
 
				-	length = BBTOB(bmv->bmv_length);
			
 
				-
			
 
				-	if (bmv->bmv_oflags & BMV_OF_PREALLOC)
			
 
				-		fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
			
 
				-	else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
			
 
				-		fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
			
 
				-				 FIEMAP_EXTENT_UNKNOWN);
			
 
				-		physical = 0;   /* no block yet */
			
 
				-	}
			
 
				-	if (bmv->bmv_oflags & BMV_OF_LAST)
			
 
				-		fiemap_flags |= FIEMAP_EXTENT_LAST;
			
 
				-
			
 
				-	error = fiemap_fill_next_extent(fieinfo, logical, physical,
			
 
				-					length, fiemap_flags);
			
 
				-	if (error > 0) {
			
 
				-		error = 0;
			
 
				-		*full = 1;	/* user array now full */
			
 
				-	}
			
 
				-
			
 
				-	return error;
			
 
				-}
			
 
				-
			
 
				 STATIC int
			
 
				 xfs_vn_fiemap(
			
 
				 	struct inode		*inode,
			
@@ -1050,38 +1006,13 @@ xfs_vn_fiemap(
 
				 	u64			start,
			
 
				 	u64			length)
			
 
				 {
			
 
				-	xfs_inode_t		*ip = XFS_I(inode);
			
 
				-	struct getbmapx		bm;
			
 
				 	int			error;
			
 
				 
			
 
				-	error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
			
 
				-	if (error)
			
 
				-		return error;
			
 
				-
			
 
				-	/* Set up bmap header for xfs internal routine */
			
 
				-	bm.bmv_offset = BTOBBT(start);
			
 
				-	/* Special case for whole file */
			
 
				-	if (length == FIEMAP_MAX_OFFSET)
			
 
				-		bm.bmv_length = -1LL;
			
 
				-	else
			
 
				-		bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
			
 
				-
			
 
				-	/* We add one because in getbmap world count includes the header */
			
 
				-	bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
			
 
				-					fieinfo->fi_extents_max + 1;
			
 
				-	bm.bmv_count = min_t(__s32, bm.bmv_count,
			
 
				-			     (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
			
 
				-	bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
			
 
				-	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
			
 
				-		bm.bmv_iflags |= BMV_IF_ATTRFORK;
			
 
				-	if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
			
 
				-		bm.bmv_iflags |= BMV_IF_DELALLOC;
			
 
				-
			
 
				-	error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
			
 
				-	if (error)
			
 
				-		return error;
			
 
				+	xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
			
 
				+	error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
			
 
				+	xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
			
 
				 
			
 
				-	return 0;
			
 
				+	return error;
			
 
				 }
			
 
				 
			
 
				 STATIC int
			
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -80,32 +80,6 @@ xfs_fs_get_uuid(
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static void
			
 
				-xfs_bmbt_to_iomap(
			
 
				-	struct xfs_inode	*ip,
			
 
				-	struct iomap		*iomap,
			
 
				-	struct xfs_bmbt_irec	*imap)
			
 
				-{
			
 
				-	struct xfs_mount	*mp = ip->i_mount;
			
 
				-
			
 
				-	if (imap->br_startblock == HOLESTARTBLOCK) {
			
 
				-		iomap->blkno = IOMAP_NULL_BLOCK;
			
 
				-		iomap->type = IOMAP_HOLE;
			
 
				-	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
			
 
				-		iomap->blkno = IOMAP_NULL_BLOCK;
			
 
				-		iomap->type = IOMAP_DELALLOC;
			
 
				-	} else {
			
 
				-		iomap->blkno =
			
 
				-			XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
			
 
				-		if (imap->br_state == XFS_EXT_UNWRITTEN)
			
 
				-			iomap->type = IOMAP_UNWRITTEN;
			
 
				-		else
			
 
				-			iomap->type = IOMAP_MAPPED;
			
 
				-	}
			
 
				-	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
			
 
				-	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * Get a layout for the pNFS client.
			
 
				  */
			
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 
				 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
			
 
				 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
			
 
				 DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
			
 
				+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
			
 
				+DEFINE_IOMAP_EVENT(xfs_iomap_found);
			
 
				+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
			
 
				 
			
 
				 DECLARE_EVENT_CLASS(xfs_simple_io_class,
			
 
				 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),