@@ -1259,13 +1259,28 @@ xfs_vm_releasepage(
  * the DIO. There is only going to be one reference to the ioend and its life
  * cycle is constrained by the DIO completion code. hence we don't need
  * reference counting here.
+ *
+ * Note that for DIO, an IO to the highest supported file block offset (i.e.
+ * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
+ * bit variable. Hence if we see this overflow, we have to assume that the IO is
+ * extending the file size. We won't know for sure until IO completion is run
+ * and the actual max write offset is communicated to the IO completion
+ * routine.
+ *
+ * For DAX page faults, we are preparing to never see unwritten extents here,
+ * nor should we ever extend the inode size. Hence we will soon have nothing to
+ * do here for this case, ensuring we don't have to provide an IO completion
+ * callback to free an ioend that we don't actually need for a fault into the
+ * page at offset (2^63 - 1FSB) bytes.
  */
+
 static void
 xfs_map_direct(
 	struct inode		*inode,
 	struct buffer_head	*bh_result,
 	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset)
+	xfs_off_t		offset,
+	bool			dax_fault)
 {
 	struct xfs_ioend	*ioend;
 	xfs_off_t		size = bh_result->b_size;
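The overflow the new comment block describes is concrete enough to demonstrate: for an IO ending exactly at the 2^63 byte boundary, offset + size wraps negative, which is what the "offset + size < 0" test added two hunks below catches. A minimal userspace sketch of the arithmetic (an illustration only, not kernel code; the addition is done in unsigned so the wraparound is well-defined, whereas the kernel relies on -fno-strict-overflow):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t fsb    = 4096;			/* assume a 4k filesystem block */
	int64_t offset = INT64_MAX - fsb + 1;	/* last mappable block: 2^63 - 1FSB */
	int64_t size   = fsb;			/* one block of IO */

	/* compute in unsigned, then reinterpret, to model the kernel's wrap */
	int64_t end = (int64_t)((uint64_t)offset + (uint64_t)size);

	if (end < 0)
		printf("offset + size wrapped negative: assume size-extending IO\n");
	return 0;
}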
@@ -1278,6 +1293,13 @@ xfs_map_direct(
 
 	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
 
+	if (dax_fault) {
+		ASSERT(type == XFS_IO_OVERWRITE);
+		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+					    imap);
+		return;
+	}
+
 	if (bh_result->b_private) {
 		ioend = bh_result->b_private;
 		ASSERT(ioend->io_size > 0);
@@ -1292,7 +1314,8 @@ xfs_map_direct(
 					      ioend->io_size, ioend->io_type,
 					      imap);
 	} else if (type == XFS_IO_UNWRITTEN ||
-		   offset + size > i_size_read(inode)) {
+		   offset + size > i_size_read(inode) ||
+		   offset + size < 0) {
 		ioend = xfs_alloc_ioend(inode, type);
 		ioend->io_offset = offset;
 		ioend->io_size = size;
@@ -1354,7 +1377,8 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	bool			direct)
+	bool			direct,
+	bool			dax_fault)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -1402,10 +1426,12 @@ __xfs_get_blocks(
 	if (error)
 		goto out_unlock;
 
+	/* for DAX, we convert unwritten extents directly */
 	if (create &&
 	    (!nimaps ||
 	     (imap.br_startblock == HOLESTARTBLOCK ||
-	      imap.br_startblock == DELAYSTARTBLOCK))) {
+	      imap.br_startblock == DELAYSTARTBLOCK) ||
+	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
 		if (direct || xfs_get_extsz_hint(ip)) {
 			/*
 			 * xfs_iomap_write_direct() expects the shared lock. It
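The hunk above widens the "needs allocation" test: DAX has to convert unwritten extents here, at block-mapping time, because there is no IO completion left to do the conversion later. A simplified, standalone sketch of the resulting decision (hypothetical names and a plain enum, purely illustrative of the predicate, not the real XFS types):

#include <stdbool.h>

enum extent_state { HOLE, DELALLOC, UNWRITTEN, WRITTEN };

/* simplified mirror of the create-path test in __xfs_get_blocks */
static bool needs_allocation(bool create, bool is_dax, enum extent_state state)
{
	if (!create)
		return false;
	/* holes and delalloc extents always need real blocks allocated */
	if (state == HOLE || state == DELALLOC)
		return true;
	/* DAX must convert unwritten extents now; other paths convert at IO completion */
	return is_dax && state == UNWRITTEN;
}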
@@ -1450,6 +1476,12 @@ __xfs_get_blocks(
 		goto out_unlock;
 	}
 
+	if (IS_DAX(inode) && create) {
+		ASSERT(!ISUNWRITTEN(&imap));
+		/* zeroing is not needed at a higher layer */
+		new = 0;
+	}
+
 	/* trim mapping down to size requested */
 	if (direct || size > (1 << inode->i_blkbits))
 		xfs_map_trim_size(inode, iblock, bh_result,
@@ -1467,7 +1499,8 @@ __xfs_get_blocks(
 			set_buffer_unwritten(bh_result);
 		/* direct IO needs special help */
 		if (create && direct)
-			xfs_map_direct(inode, bh_result, &imap, offset);
+			xfs_map_direct(inode, bh_result, &imap, offset,
+				       dax_fault);
 	}
 
 	/*
@@ -1514,7 +1547,7 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
 }
 
 int
@@ -1524,7 +1557,17 @@ xfs_get_blocks_direct(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+}
+
+int
+xfs_get_blocks_dax_fault(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
 }
 
 static void
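For context on how the new wrapper is meant to be consumed: the fault path can now request a block mapping with no IO completion callback at all. A hypothetical caller sketch, assuming the __dax_fault() interface of this era (the wiring below is an illustration, not part of this patch):

/* hypothetical sketch: wiring the new helper into the DAX fault path */
static int
xfs_filemap_fault_sketch(
	struct vm_area_struct	*vma,
	struct vm_fault		*vmf)
{
	/*
	 * DAX faults never see unwritten extents and never extend i_size,
	 * so no completion callback is passed (NULL instead of a
	 * dax_iodone_t handler such as the xfs_end_io_dax_write removed
	 * below).
	 */
	return __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
}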
@@ -1623,45 +1666,6 @@ xfs_end_io_direct_write(
 	__xfs_end_io_direct_write(inode, ioend, offset, size);
 }
 
-/*
- * For DAX we need a mapping buffer callback for unwritten extent conversion
- * when page faults allocate blocks and then zero them. Note that in this
- * case the mapping indicated by the ioend may extend beyond EOF. We most
- * definitely do not want to extend EOF here, so we trim back the ioend size to
- * EOF.
- */
-#ifdef CONFIG_FS_DAX
-void
-xfs_end_io_dax_write(
-	struct buffer_head	*bh,
-	int			uptodate)
-{
-	struct xfs_ioend	*ioend = bh->b_private;
-	struct inode		*inode = ioend->io_inode;
-	ssize_t			size = ioend->io_size;
-
-	ASSERT(IS_DAX(ioend->io_inode));
-
-	/* if there was an error zeroing, then don't convert it */
-	if (!uptodate)
-		ioend->io_error = -EIO;
-
-	/*
-	 * Trim update to EOF, so we don't extend EOF during unwritten extent
-	 * conversion of partial EOF blocks.
-	 */
-	spin_lock(&XFS_I(inode)->i_flags_lock);
-	if (ioend->io_offset + size > i_size_read(inode))
-		size = i_size_read(inode) - ioend->io_offset;
-	spin_unlock(&XFS_I(inode)->i_flags_lock);
-
-	__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
-
-}
-#else
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
-#endif
-
 static inline ssize_t
 xfs_vm_do_dio(
 	struct inode		*inode,