Merge branch 'xfs-dax-support' into for-next

Dave Chinner, 10 years ago
commit 66e8ac7bfa

+ 27 - 7
fs/dax.c

@@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  out:
 	i_mmap_unlock_read(mapping);
 
-	if (bh->b_end_io)
-		bh->b_end_io(bh, 1);
-
 	return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
@@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		page_cache_release(page);
 	}
 
+	/*
+	 * If we successfully insert the new mapping over an unwritten extent,
+	 * we need to ensure we convert the unwritten extent. If there is an
+	 * error inserting the mapping, the filesystem needs to leave it as
+	 * unwritten to prevent exposure of the stale underlying data to
+	 * userspace, but we still need to call the completion function so
+	 * the private resources on the mapping buffer can be released. We
+	 * indicate what the callback should do via the uptodate variable, same
+	 * as for normal BH based IO completions.
+	 */
 	error = dax_insert_mapping(inode, &bh, vma, vmf);
+	if (buffer_unwritten(&bh))
+		complete_unwritten(&bh, !error);
 
  out:
 	if (error == -ENOMEM)
@@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 	goto out;
 }
+EXPORT_SYMBOL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+	      get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	int result;
 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = do_dax_fault(vma, vmf, get_block);
+	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 

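For context beyond the diff: the split between dax_fault() and the newly exported __dax_fault() follows the usual locked/unlocked kernel API pattern. A minimal sketch of how a filesystem might call __dax_fault() from its own fault handler under its own truncate-serialisation lock — the foo_* names are hypothetical stand-ins, not part of this commit:

/* Sketch only: foo_mmap_lock()/foo_mmap_unlock()/foo_get_block are
 * hypothetical illustrations, not functions introduced by this merge. */
static int foo_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	int ret;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}
	foo_mmap_lock(inode);	/* caller-supplied truncate serialisation */
	/* NULL: this hypothetical fs never hands out unwritten extents */
	ret = __dax_fault(vma, vmf, foo_get_block, NULL);
	foo_mmap_unlock(inode);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);
	return ret;
}

Passing NULL for complete_unwritten is only safe when the get_block callback can never return an unwritten mapping, which is exactly why ext2 does so in the next hunk while ext4 and XFS supply real callbacks.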
+ 2 - 2
fs/ext2/file.c

@@ -28,12 +28,12 @@
 #ifdef CONFIG_FS_DAX
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext2_get_block);
+	return dax_fault(vma, vmf, ext2_get_block, NULL);
 }
 
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext2_get_block);
+	return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
 }
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {

+ 14 - 2
fs/ext4/file.c

@@ -192,15 +192,27 @@ out:
 }
 
 #ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+	struct inode *inode = bh->b_assoc_map->host;
+	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+	int err;
+	if (!uptodate)
+		return;
+	WARN_ON(!buffer_unwritten(bh));
+	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext4_get_block);
+	return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 					/* Is this the right get_block? */
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext4_get_block);
+	return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {

+ 7 - 14
fs/ext4/inode.c

@@ -656,18 +656,6 @@ has_zeroout:
 	return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-	struct inode *inode = bh->b_assoc_map->host;
-	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-	int err;
-	if (!uptodate)
-		return;
-	WARN_ON(!buffer_unwritten(bh));
-	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+		if (IS_DAX(inode) && buffer_unwritten(bh)) {
+			/*
+			 * dgc: I suspect unwritten conversion on ext4+DAX is
+			 * fundamentally broken here when there are concurrent
+			 * read/write in progress on this inode.
+			 */
+			WARN_ON_ONCE(io_end);
 			bh->b_assoc_map = inode->i_mapping;
 			bh->b_private = (void *)(unsigned long)iblock;
-			bh->b_end_io = ext4_end_io_unwritten;
 		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);

+ 110 - 42
fs/xfs/xfs_aops.c

@@ -1349,7 +1349,7 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	int			direct)
+	bool			direct)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -1414,6 +1414,7 @@ __xfs_get_blocks(
 			if (error)
 				return error;
 			new = 1;
+
 		} else {
 			/*
 			 * Delalloc reservations do not require a transaction,
@@ -1508,49 +1509,29 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
-STATIC int
+int
 xfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }
 
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-	struct kiocb		*iocb,
+static void
+__xfs_end_io_direct_write(
+	struct inode		*inode,
+	struct xfs_ioend	*ioend,
 	loff_t			offset,
-	ssize_t			size,
-	void			*private)
+	ssize_t			size)
 {
-	struct inode		*inode = file_inode(iocb->ki_filp);
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ioend	*ioend = private;
-
-	trace_xfs_gbmap_direct_endio(ip, offset, size,
-				     ioend ? ioend->io_type : 0, NULL);
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
 
-	if (!ioend) {
-		ASSERT(offset + size <= i_size_read(inode));
-		return;
-	}
-
-	if (XFS_FORCED_SHUTDOWN(mp))
+	if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
 		goto out_end_io;
 
 	/*
@@ -1587,10 +1568,10 @@ xfs_end_io_direct_write(
 	 * here can result in EOF moving backwards and Bad Things Happen when
 	 * that occurs.
 	 */
-	spin_lock(&ip->i_flags_lock);
+	spin_lock(&XFS_I(inode)->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
-	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
 
 	/*
 	 * If we are doing an append IO that needs to update the EOF on disk,
@@ -1607,6 +1588,98 @@ out_end_io:
 	return;
 }
 
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
+	loff_t			offset,
+	ssize_t			size,
+	void			*private)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_ioend	*ioend = private;
+
+	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
+		return;
+	}
+
+	__xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so we trim back the ioend size to
+ * EOF.
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	struct xfs_ioend	*ioend = bh->b_private;
+	struct inode		*inode = ioend->io_inode;
+	ssize_t			size = ioend->io_size;
+
+	ASSERT(IS_DAX(ioend->io_inode));
+
+	/* if there was an error zeroing, then don't convert it */
+	if (!uptodate)
+		ioend->io_error = -EIO;
+
+	/*
+	 * Trim update to EOF, so we don't extend EOF during unwritten extent
+	 * conversion of partial EOF blocks.
+	 */
+	spin_lock(&XFS_I(inode)->i_flags_lock);
+	if (ioend->io_offset + size > i_size_read(inode))
+		size = i_size_read(inode) - ioend->io_offset;
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+	__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
+
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
+#endif
+
+static inline ssize_t
+xfs_vm_do_dio(
+	struct inode		*inode,
+	struct kiocb		*iocb,
+	struct iov_iter		*iter,
+	loff_t			offset,
+	void			(*endio)(struct kiocb	*iocb,
+					 loff_t		offset,
+					 ssize_t	size,
+					 void		*private),
+	int			flags)
+{
+	struct block_device	*bdev;
+
+	if (IS_DAX(inode))
+		return dax_do_io(iocb, inode, iter, offset,
+				 xfs_get_blocks_direct, endio, 0);
+
+	bdev = xfs_find_bdev_for_inode(inode);
+	return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+				     xfs_get_blocks_direct, endio, NULL, flags);
+}
+
 STATIC ssize_t
 xfs_vm_direct_IO(
 	struct kiocb		*iocb,
@@ -1614,16 +1687,11 @@ xfs_vm_direct_IO(
 	loff_t			offset)
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
-	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
 
-	if (iov_iter_rw(iter) == WRITE) {
-		return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-					    xfs_get_blocks_direct,
-					    xfs_end_io_direct_write, NULL,
-					    DIO_ASYNC_EXTEND);
-	}
-	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-				    xfs_get_blocks_direct, NULL, NULL, 0);
+	if (iov_iter_rw(iter) == WRITE)
+		return xfs_vm_do_dio(inode, iocb, iter, offset,
+				     xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+	return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
 }
 
 /*

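One link in the chain above is only implied by this hunk: xfs_end_io_dax_write() pulls the ioend out of bh->b_private, so the DAX get_block path must have attached it when it mapped the unwritten extent. A sketch of the assumed handoff on the allocation side of __xfs_get_blocks(), which is not shown in this diff and is therefore an assumption about the rest of the series:

	/* assumed to happen in __xfs_get_blocks() when a direct/DAX request
	 * maps an unwritten extent: pass the ioend along on the buffer so
	 * the completion callback can find it */
	bh_result->b_private = ioend;

__dax_fault() then invokes complete_unwritten() on that buffer_head, and this b_private round trip is what connects block allocation to unwritten-extent conversion.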
+ 6 - 1
fs/xfs/xfs_aops.h

@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int	xfs_get_blocks(struct inode *inode, sector_t offset,
+		       struct buffer_head *map_bh, int create);
+int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+			      struct buffer_head *map_bh, int create);
+void	xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 

+ 19 - 4
fs/xfs/xfs_bmap_util.c

@@ -1133,14 +1133,29 @@ xfs_zero_remaining_bytes(
 			break;
 		ASSERT(imap.br_blockcount >= 1);
 		ASSERT(imap.br_startoff == offset_fsb);
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+		if (imap.br_startblock == HOLESTARTBLOCK ||
+		    imap.br_state == XFS_EXT_UNWRITTEN) {
+			/* skip the entire extent */
+			lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
+						      imap.br_blockcount) - 1;
+			continue;
+		}
+
 		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
 		if (lastoffset > endoff)
 			lastoffset = endoff;
-		if (imap.br_startblock == HOLESTARTBLOCK)
-			continue;
-		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-		if (imap.br_state == XFS_EXT_UNWRITTEN)
+
+		/* DAX can just zero the backing device directly */
+		if (IS_DAX(VFS_I(ip))) {
+			error = dax_zero_page_range(VFS_I(ip), offset,
+						    lastoffset - offset + 1,
+						    xfs_get_blocks_direct);
+			if (error)
+				return error;
 			continue;
+		}
 
 		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp,

+ 98 - 64
fs/xfs/xfs_file.c

@@ -79,14 +79,15 @@ xfs_rw_ilock_demote(
 }
 
 /*
- *	xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
  *
- *	xfs_iozero clears the specified range of buffer supplied,
- *	and marks all the affected blocks as valid and modified.  If
- *	an affected block is not allocated, it will be allocated.  If
- *	an affected block is not completely overwritten, and is not
- *	valid before the operation, it will be read from disk before
- *	being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
  */
 int
 xfs_iozero(
@@ -96,7 +97,8 @@ xfs_iozero(
 {
 	struct page		*page;
 	struct address_space	*mapping;
-	int			status;
+	int			status = 0;
+
 
 	mapping = VFS_I(ip)->i_mapping;
 	do {
@@ -108,20 +110,27 @@ xfs_iozero(
 		if (bytes > count)
 			bytes = count;
 
-		status = pagecache_write_begin(NULL, mapping, pos, bytes,
-					AOP_FLAG_UNINTERRUPTIBLE,
-					&page, &fsdata);
-		if (status)
-			break;
+		if (IS_DAX(VFS_I(ip))) {
+			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+						     xfs_get_blocks_direct);
+			if (status)
+				break;
+		} else {
+			status = pagecache_write_begin(NULL, mapping, pos, bytes,
+						AOP_FLAG_UNINTERRUPTIBLE,
+						&page, &fsdata);
+			if (status)
+				break;
 
-		zero_user(page, offset, bytes);
+			zero_user(page, offset, bytes);
 
-		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-					page, fsdata);
-		WARN_ON(status <= 0); /* can't return less than zero! */
+			status = pagecache_write_end(NULL, mapping, pos, bytes,
+						bytes, page, fsdata);
+			WARN_ON(status <= 0); /* can't return less than zero! */
+			status = 0;
+		}
 		pos += bytes;
 		count -= bytes;
-		status = 0;
 	} while (count);
 
 	return status;
@@ -284,7 +293,7 @@ xfs_file_read_iter(
 	if (file->f_mode & FMODE_NOCMTIME)
 		ioflags |= XFS_IO_INVIS;
 
-	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -378,7 +387,11 @@ xfs_file_splice_read(
 
 	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+	/* for dax, we need to avoid the page cache */
+	if (IS_DAX(VFS_I(ip)))
+		ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+	else
+		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -672,7 +685,7 @@ xfs_file_dio_aio_write(
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
-	if ((pos | count) & target->bt_logical_sectormask)
+	if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
 		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
@@ -758,8 +771,11 @@ xfs_file_dio_aio_write(
 out:
 	xfs_rw_iunlock(ip, iolock);
 
-	/* No fallback to buffered IO on errors for XFS. */
-	ASSERT(ret < 0 || ret == count);
+	/*
+	 * No fallback to buffered IO on errors for XFS. DAX can result in
+	 * partial writes, but direct IO will either complete fully or fail.
+	 */
+	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
 	return ret;
 }
 
@@ -842,7 +858,7 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1063,17 +1079,6 @@ xfs_file_readdir(
 	return xfs_readdir(ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-	struct file	*filp,
-	struct vm_area_struct *vma)
-{
-	vma->vm_ops = &xfs_file_vm_ops;
-
-	file_accessed(filp);
-	return 0;
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1454,48 +1459,83 @@ xfs_file_llseek(
  * ordering of:
  *
  * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
  */
  */
 STATIC int
+xfs_filemap_page_mkwrite(
 	struct vm_area_struct	*vma,
 	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	int			error;
+	struct inode		*inode = file_inode(vma->vm_file);
+	int			ret;
 
 
-	trace_xfs_filemap_fault(ip);
+	trace_xfs_filemap_page_mkwrite(XFS_I(inode));
 
 
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	error = filemap_fault(vma, vmf);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 
-	return error;
+	if (IS_DAX(inode)) {
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+				    xfs_end_io_dax_write);
+	} else {
+		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = block_page_mkwrite_return(ret);
+	}
+
+	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	sb_end_pagefault(inode->i_sb);
+
+	return ret;
 }
 }
 
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
 STATIC int
 STATIC int
+xfs_filemap_fault(
 	struct vm_area_struct	*vma,
 	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	int			error;
+	struct xfs_inode	*ip = XFS_I(file_inode(vma->vm_file));
+	int			ret;
+
+	trace_xfs_filemap_fault(ip);
 
 
-	trace_xfs_filemap_page_mkwrite(ip);
+	/* DAX can shortcut the normal fault path on write faults! */
+	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+		return xfs_filemap_page_mkwrite(vma, vmf);
 
 
 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+	ret = filemap_fault(vma, vmf);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 
-	return error;
+	return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+	.fault		= xfs_filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+	struct file	*filp,
+	struct vm_area_struct *vma)
+{
+	file_accessed(filp);
+	vma->vm_ops = &xfs_file_vm_ops;
+	if (IS_DAX(file_inode(filp)))
+		vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
 }
 
 const struct file_operations xfs_file_operations = {
@@ -1526,9 +1566,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
 #endif
 	.fsync		= xfs_dir_fsync,
 };
-static const struct vm_operations_struct xfs_file_vm_ops = {
-	.fault		= xfs_filemap_fault,
-	.map_pages	= filemap_map_pages,
-	.page_mkwrite	= xfs_filemap_page_mkwrite,
-};

+ 17 - 13
fs/xfs/xfs_iops.c

@@ -851,7 +851,11 @@ xfs_setattr_size(
 	 * to hope that the caller sees ENOMEM and retries the truncate
 	 * to hope that the caller sees ENOMEM and retries the truncate
 	 * operation.
 	 */
+	if (IS_DAX(inode))
+		error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
+	else
+		error = block_truncate_page(inode->i_mapping, newsize,
+					    xfs_get_blocks);
 	if (error)
 	if (error)
 		return error;
 	truncate_setsize(inode, newsize);
 	struct inode		*inode,
 	struct inode		*inode,
 	struct xfs_inode	*ip)
 {
+	uint16_t		flags = ip->i_d.di_flags;
+
+	inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
+			    S_NOATIME | S_DAX);
+
+	if (flags & XFS_DIFLAG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
 		inode->i_flags |= S_IMMUTABLE;
-		inode->i_flags &= ~S_IMMUTABLE;
-	if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+	if (flags & XFS_DIFLAG_APPEND)
 		inode->i_flags |= S_APPEND;
 		inode->i_flags |= S_APPEND;
-		inode->i_flags &= ~S_APPEND;
-	if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+	if (flags & XFS_DIFLAG_SYNC)
 		inode->i_flags |= S_SYNC;
 		inode->i_flags |= S_SYNC;
-		inode->i_flags &= ~S_SYNC;
-	if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+	if (flags & XFS_DIFLAG_NOATIME)
 		inode->i_flags |= S_NOATIME;
 		inode->i_flags |= S_NOATIME;
-		inode->i_flags &= ~S_NOATIME;
+	/* XXX: Also needs an on-disk per inode flag! */
+	if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+		inode->i_flags |= S_DAX;
 }
 }
 
 /*
+ 2 - 0
fs/xfs/xfs_mount.h

@@ -181,6 +181,8 @@ typedef struct xfs_mount {
 						   allocator */
 						   allocator */
 #define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
 
+#define XFS_MOUNT_DAX		(1ULL << 62)	/* TEST ONLY! */
+
 
 /*
  * Default minimum read and write sizes.

+ 23 - 2
fs/xfs/xfs_super.c

@@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj;	/* global debug sysfs attrs */
 #define MNTOPT_DISCARD	   "discard"	/* Discard unused blocks */
 #define MNTOPT_NODISCARD   "nodiscard"	/* Do not discard unused blocks */
 
+#define MNTOPT_DAX	"dax"		/* Enable direct access to bdev pages */
+
 /*
  * Table driven mount option parser.
  *
@@ -363,6 +365,10 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DISCARD;
 		} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
 			mp->m_flags &= ~XFS_MOUNT_DISCARD;
+#ifdef CONFIG_FS_DAX
+		} else if (!strcmp(this_char, MNTOPT_DAX)) {
+			mp->m_flags |= XFS_MOUNT_DAX;
+#endif
 		} else {
 			xfs_warn(mp, "unknown mount option [%s].", this_char);
 			return -EINVAL;
@@ -452,8 +458,8 @@ done:
 }
 
 struct proc_xfs_info {
-	int	flag;
-	char	*str;
+	uint64_t	flag;
+	char		*str;
 };
 
 STATIC int
@@ -474,6 +480,7 @@ xfs_showargs(
 		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
 		{ XFS_MOUNT_DISCARD,		"," MNTOPT_DISCARD },
 		{ XFS_MOUNT_SMALL_INUMS,	"," MNTOPT_32BITINODE },
+		{ XFS_MOUNT_DAX,		"," MNTOPT_DAX },
 		{ 0, NULL }
 	};
 	static struct proc_xfs_info xfs_info_unset[] = {
@@ -1507,6 +1514,20 @@ xfs_fs_fill_super(
 	if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
 		sb->s_flags |= MS_I_VERSION;
 
+	if (mp->m_flags & XFS_MOUNT_DAX) {
+		xfs_warn(mp,
+	"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+		if (sb->s_blocksize != PAGE_SIZE) {
+			xfs_alert(mp,
+		"Filesystem block size invalid for DAX Turning DAX off.");
+			mp->m_flags &= ~XFS_MOUNT_DAX;
+		} else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+			xfs_alert(mp,
+		"Block device does not support DAX Turning DAX off.");
+			mp->m_flags &= ~XFS_MOUNT_DAX;
+		}
+	}
+
 	error = xfs_mountfs(mp);
 	if (error)
 		goto out_filestream_unmount;

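With the option wired up, DAX is enabled per mount. The device and mountpoint below are illustrative assumptions; this is the mount(2) equivalent of `mount -o dax /dev/pmem0 /mnt`:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* /dev/pmem0 and /mnt are assumptions. Per the hunk above, the
	 * block device must provide ->direct_access() and the fs block
	 * size must equal PAGE_SIZE, or XFS turns DAX back off. */
	if (mount("/dev/pmem0", "/mnt", "xfs", 0, "dax") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}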
+ 7 - 2
include/linux/fs.h

@@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 			ssize_t bytes, void *private);
+typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 
 #define MAY_EXEC		0x00000001
 #define MAY_WRITE		0x00000002
@@ -2627,9 +2628,13 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+		dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+		dax_iodone_t);
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
+#define dax_mkwrite(vma, vmf, gb, iod)		dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod)	__dax_fault(vma, vmf, gb, iod)
 
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
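To summarise the new callback contract: a dax_iodone_t runs after the fault path has zeroed and mapped the block, and uptodate mirrors BH-style completion semantics — on failure the extent must stay unwritten so stale data is never exposed, while any per-buffer state is still released. A hedged sketch for a hypothetical filesystem (the bar_* helpers are inventions for illustration; ext4's real implementation is in the fs/ext4/file.c hunk above):

static void bar_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
	struct inode *inode = bh->b_assoc_map->host;

	if (!uptodate) {
		/* mapping failed: leave the extent unwritten, but drop
		 * whatever private state was attached to the buffer */
		bar_put_mapping_state(bh->b_private);	/* hypothetical */
		return;
	}
	bar_convert_unwritten(inode, bh);		/* hypothetical */
}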