Merge branch 'iomap-4.9-dax' into for-next

Dave Chinner, 8 years ago (parent commit a1f45e668e)
13 changed files with 464 additions and 122 deletions
  1. fs/dax.c (+240 -12)
  2. fs/ext2/Kconfig (+1 -0)
  3. fs/ext2/ext2.h (+1 -0)
  4. fs/ext2/file.c (+69 -7)
  5. fs/ext2/inode.c (+81 -19)
  6. fs/internal.h (+11 -0)
  7. fs/iomap.c (+1 -4)
  8. fs/xfs/xfs_aops.c (+21 -10)
  9. fs/xfs/xfs_aops.h (+1 -0)
  10. fs/xfs/xfs_file.c (+17 -62)
  11. fs/xfs/xfs_iomap.c (+14 -8)
  12. include/linux/dax.h (+6 -0)
  13. include/linux/iomap.h (+1 -0)

fs/dax.c (+240 -12)

@@ -31,6 +31,8 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
 
 /*
  * We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-		struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+		struct page *to, unsigned long vaddr)
 {
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
-	struct block_device *bdev = bh->b_bdev;
 	void *vto;
 
 	if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-			struct buffer_head *bh, void **entryp,
-			struct vm_area_struct *vma, struct vm_fault *vmf)
+		struct block_device *bdev, sector_t sector, size_t size,
+		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct block_device *bdev = bh->b_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, mapping->host),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
 	void *ret;
 	void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (vmf->cow_page) {
 		struct page *new_page = vmf->cow_page;
 		if (buffer_written(&bh))
-			error = copy_user_bh(new_page, inode, &bh, vaddr);
+			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+					bh.b_size, new_page, vaddr);
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Filesystem should not return unwritten buffers to us! */
 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+			bh.b_size, &entry, vma, vmf);
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 	return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *iter = data;
+	loff_t end = pos + length, done = 0;
+	ssize_t ret = 0;
+
+	if (iov_iter_rw(iter) == READ) {
+		end = min(end, i_size_read(inode));
+		if (pos >= end)
+			return 0;
+
+		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+			return iov_iter_zero(min(length, end - pos), iter);
+	}
+
+	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+		return -EIO;
+
+	while (pos < end) {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+		struct blk_dax_ctl dax = { 0 };
+		ssize_t map_len;
+
+		dax.sector = iomap->blkno +
+			(((pos & PAGE_MASK) - iomap->offset) >> 9);
+		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+		map_len = dax_map_atomic(iomap->bdev, &dax);
+		if (map_len < 0) {
+			ret = map_len;
+			break;
+		}
+
+		dax.addr += offset;
+		map_len -= offset;
+		if (map_len > end - pos)
+			map_len = end - pos;
+
+		if (iov_iter_rw(iter) == WRITE)
+			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+		else
+			map_len = copy_to_iter(dax.addr, map_len, iter);
+		dax_unmap_atomic(iomap->bdev, &dax);
+		if (map_len <= 0) {
+			ret = map_len ? map_len : -EFAULT;
+			break;
+		}
+
+		pos += map_len;
+		length -= map_len;
+		done += map_len;
+	}
+
+	return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb:	The control block for this I/O
+ * @iter:	The addresses to do I/O from or to
+ * @ops:	iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory.  The caller needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+	unsigned flags = 0;
+
+	if (iov_iter_rw(iter) == WRITE)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
+	 *
+	 * XXX: This is racy against mmap, and there's nothing we can do about
+	 * it. We'll eventually need to shift this down even further so that
+	 * we can check if we allocated blocks over a hole first.
+	 */
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_SHIFT,
+				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+	}
+
+	while (iov_iter_count(iter)) {
+		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+				iter, iomap_dax_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		done += ret;
+	}
+
+	iocb->ki_pos += done;
+	return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files. Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			struct iomap_ops *ops)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+	sector_t sector;
+	struct iomap iomap = { 0 };
+	unsigned flags = 0;
+	int error, major = 0;
+	void *entry;
+
+	/*
+	 * Check whether the offset is beyond the end of the file. The caller is
+	 * supposed to hold locks serializing us against truncate / punch hole,
+	 * so this is a reliable test.
+	 */
+	if (pos >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		error = PTR_ERR(entry);
+		goto out;
+	}
+
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Note that we don't bother to use iomap_apply here: DAX requires
+	 * the file system block size to equal the page size, which means
+	 * that we never have to deal with more than a single extent here.
+	 */
+	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+	if (error)
+		goto unlock_entry;
+	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+		error = -EIO;		/* fs corruption? */
+		goto unlock_entry;
+	}
+
+	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+	if (vmf->cow_page) {
+		switch (iomap.type) {
+		case IOMAP_HOLE:
+		case IOMAP_UNWRITTEN:
+			clear_user_highpage(vmf->cow_page, vaddr);
+			break;
+		case IOMAP_MAPPED:
+			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+					vmf->cow_page, vaddr);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			error = -EIO;
+			break;
+		}
+
+		if (error)
+			goto unlock_entry;
+		if (!radix_tree_exceptional_entry(entry)) {
+			vmf->page = entry;
+			return VM_FAULT_LOCKED;
+		}
+		vmf->entry = entry;
+		return VM_FAULT_DAX_LOCKED;
+	}
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		if (iomap.flags & IOMAP_F_NEW) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+		}
+		error = dax_insert_mapping(mapping, iomap.bdev, sector,
+				PAGE_SIZE, &entry, vma, vmf);
+		break;
+	case IOMAP_UNWRITTEN:
+	case IOMAP_HOLE:
+		if (!(vmf->flags & FAULT_FLAG_WRITE))
+			return dax_load_hole(mapping, entry, vmf);
+		/*FALLTHRU*/
+	default:
+		WARN_ON_ONCE(1);
+		error = -EIO;
+		break;
+	}
+
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM | major;
+	/* -EBUSY is fine, somebody else faulted on the same PTE */
+	if (error < 0 && error != -EBUSY)
+		return VM_FAULT_SIGBUS | major;
+	return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
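
Both the I/O path and the fault path above derive the disk sector for a byte position with the same expression, iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9): blkno counts 512-byte sectors, so the byte distance from the start of the extent is shifted down by 9. A minimal userspace sketch of that conversion, with hypothetical values:

	/* Hypothetical demo of the sector math in iomap_dax_actor(); not kernel code. */
	#include <assert.h>
	#include <stdint.h>

	#define DEMO_PAGE_MASK (~((uint64_t)4096 - 1))	/* assuming 4 KiB pages */

	static uint64_t pos_to_sector(uint64_t blkno, uint64_t extent_off, uint64_t pos)
	{
		/* blkno is in 512-byte units; >> 9 converts the byte delta to sectors */
		return blkno + (((pos & DEMO_PAGE_MASK) - extent_off) >> 9);
	}

	int main(void)
	{
		/* extent starting at byte offset 0x200000, backed by sector 4096 */
		/* pos 0x201234 page-aligns to 0x201000: 0x1000 bytes = 8 sectors in */
		assert(pos_to_sector(4096, 0x200000, 0x201234) == 4096 + 8);
		return 0;
	}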

fs/ext2/Kconfig (+1 -0)

@@ -1,5 +1,6 @@
 config EXT2_FS
 	tristate "Second extended fs support"
+	select FS_IOMAP if FS_DAX
 	help
 	  Ext2 is a standard Linux file system for hard disks.
 

fs/ext2/ext2.h (+1 -0)

@@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
+extern struct iomap_ops ext2_iomap_ops;
 
 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;

fs/ext2/file.c (+69 -7)

@@ -22,11 +22,59 @@
 #include <linux/pagemap.h>
 #include <linux/dax.h>
 #include <linux/quotaops.h>
+#include <linux/iomap.h>
+#include <linux/uio.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
 
 #ifdef CONFIG_FS_DAX
+static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	ssize_t ret;
+
+	if (!iov_iter_count(to))
+		return 0; /* skip atime */
+
+	inode_lock_shared(inode);
+	ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	inode_lock(inode);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out_unlock;
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out_unlock;
+	ret = file_update_time(file);
+	if (ret)
+		goto out_unlock;
+
+	ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		mark_inode_dirty(inode);
+	}
+
+out_unlock:
+	inode_unlock(inode);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
+}
+
 /*
  * The lock ordering for ext2 DAX fault paths is:
  *
@@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_fault(vma, vmf, ext2_get_block);
+	ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	return ret;
 }
 
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the ext2 filesystem.
- */
+static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_read_iter(iocb, to);
+#endif
+	return generic_file_read_iter(iocb, to);
+}
+
+static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_write_iter(iocb, from);
+#endif
+	return generic_file_write_iter(iocb, from);
+}
+
 const struct file_operations ext2_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
-	.write_iter	= generic_file_write_iter,
+	.read_iter	= ext2_file_read_iter,
+	.write_iter	= ext2_file_write_iter,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,

fs/ext2/inode.c (+81 -19)

@@ -32,6 +32,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include "ext2.h"
@@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode,
  */
 static int ext2_get_blocks(struct inode *inode,
 			   sector_t iblock, unsigned long maxblocks,
-			   struct buffer_head *bh_result,
+			   u32 *bno, bool *new, bool *boundary,
 			   int create)
 {
 	int err = -EIO;
@@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode,
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
-		clear_buffer_new(bh_result); /* What's this do? */
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
@@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			if (err)
 				goto cleanup;
-			clear_buffer_new(bh_result);
 			goto got_it;
 		}
 	}
@@ -745,15 +744,16 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
 		}
-	} else
-		set_buffer_new(bh_result);
+	} else {
+		*new = true;
+	}
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
 	mutex_unlock(&ei->truncate_mutex);
 got_it:
-	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	*bno = le32_to_cpu(chain[depth-1].key);
 	if (count > blocks_to_boundary)
-		set_buffer_boundary(bh_result);
+		*boundary = true;
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
@@ -765,19 +765,82 @@ cleanup:
 	return err;
 }
 
-int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+int ext2_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_result, int create)
 {
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	int ret = ext2_get_blocks(inode, iblock, max_blocks,
-			      bh_result, create);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary,
+			create);
+	if (ret <= 0)
+		return ret;
+
+	map_bh(bh_result, inode->i_sb, bno);
+	bh_result->b_size = (ret << inode->i_blkbits);
+	if (new)
+		set_buffer_new(bh_result);
+	if (boundary)
+		set_buffer_boundary(bh_result);
+	return 0;
+}
+
+#ifdef CONFIG_FS_DAX
+static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned flags, struct iomap *iomap)
+{
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned long first_block = offset >> blkbits;
+	unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, first_block, max_blocks,
+			&bno, &new, &boundary, flags & IOMAP_WRITE);
+	if (ret < 0)
+		return ret;
+
+	iomap->flags = 0;
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = (u64)first_block << blkbits;
+
+	if (ret == 0) {
+		iomap->type = IOMAP_HOLE;
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->length = 1 << blkbits;
+	} else {
+		iomap->type = IOMAP_MAPPED;
+		iomap->blkno = (sector_t)bno << (blkbits - 9);
+		iomap->length = (u64)ret << blkbits;
+		iomap->flags |= IOMAP_F_MERGED;
 	}
-	return ret;
 
+	if (new)
+		iomap->flags |= IOMAP_F_NEW;
+	return 0;
 }
 
+static int
+ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+		ssize_t written, unsigned flags, struct iomap *iomap)
+{
+	if (iomap->type == IOMAP_MAPPED &&
+	    written < length &&
+	    (flags & IOMAP_WRITE))
+		ext2_write_failed(inode->i_mapping, offset + length);
+	return 0;
+}
+
+struct iomap_ops ext2_iomap_ops = {
+	.iomap_begin		= ext2_iomap_begin,
+	.iomap_end		= ext2_iomap_end,
+};
+#endif /* CONFIG_FS_DAX */
+
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len)
 {
@@ -863,11 +926,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
-				DIO_LOCKING);
-	else
-		ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
+	if (WARN_ON_ONCE(IS_DAX(inode)))
+		return -EIO;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
 	return ret;
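
In ext2_iomap_begin() above, the filesystem block number is converted into the 512-byte sector units used by struct iomap with (sector_t)bno << (blkbits - 9). A quick hypothetical check of that shift, assuming 4 KiB blocks:

	/* Hypothetical demo of the block-to-sector shift in ext2_iomap_begin(). */
	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		unsigned int blkbits = 12;	/* 4 KiB filesystem blocks (assumed) */
		uint64_t bno = 100;		/* filesystem block number */

		/* each 4 KiB block spans 8 sectors, so block 100 starts at sector 800 */
		assert((bno << (blkbits - 9)) == 800);
		return 0;
	}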

fs/internal.h (+11 -0)

@@ -12,6 +12,7 @@
 struct super_block;
 struct file_system_type;
 struct iomap;
+struct iomap_ops;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -164,3 +165,13 @@ extern struct dentry_operations ns_dentry_operations;
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
 extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/*
+ * iomap support:
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+		void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+		unsigned flags, struct iomap_ops *ops, void *data,
+		iomap_actor_t actor);
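
The contract behind this prototype: iomap_apply() asks ->iomap_begin() for a mapping at pos, trims the length to what that mapping covers, invokes the actor on the trimmed range, then calls ->iomap_end(); the actor returns how many bytes it handled, or a negative errno. A minimal hypothetical actor under that contract:

	/* Hypothetical actor sketch; a real one does its I/O against iomap->blkno. */
	static loff_t demo_actor(struct inode *inode, loff_t pos, loff_t len,
			void *data, struct iomap *iomap)
	{
		/* len is already clamped to the extent returned by ->iomap_begin() */
		if (iomap->type == IOMAP_HOLE)
			return len;		/* nothing to do for a hole here */

		return len;			/* bytes handled; > 0 keeps the caller looping */
	}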

fs/iomap.c (+1 -4)

@@ -27,9 +27,6 @@
 #include <linux/dax.h>
 #include "internal.h"
 
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-		void *data, struct iomap *iomap);
-
 /*
  * Execute an iomap write on a segment of the mapping that spans a
  * contiguous range of pages that have identical block mapping state.
@@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
  * resources they require in the iomap_begin call, and release them in the
  * iomap_end call.
  */
-static loff_t
+loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 		struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
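
With the static dropped, code elsewhere in fs/ can now drive iomap_apply() directly; the loop in iomap_dax_rw() above shows the pattern of reapplying until the byte count is exhausted or progress stops. A condensed hypothetical sketch of that driving loop:

	/* Hypothetical driver loop, mirroring iomap_dax_rw() above. */
	static loff_t demo_apply_all(struct inode *inode, loff_t pos, loff_t length,
			unsigned flags, struct iomap_ops *ops, void *data,
			iomap_actor_t actor)
	{
		loff_t ret = 0, done = 0;

		while (length > 0) {
			/* each iteration covers at most one extent */
			ret = iomap_apply(inode, pos, length, flags, ops,
					data, actor);
			if (ret <= 0)
				break;
			pos += ret;
			length -= ret;
			done += ret;
		}
		return done ? done : ret;	/* bytes done, else last error */
	}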

fs/xfs/xfs_aops.c (+21 -10)

@@ -200,7 +200,7 @@ xfs_setfilesize_trans_alloc(
  * Update on-disk file size now that data has been written to disk.
  */
 STATIC int
-xfs_setfilesize(
+__xfs_setfilesize(
 	struct xfs_inode	*ip,
 	struct xfs_trans	*tp,
 	xfs_off_t		offset,
@@ -225,6 +225,23 @@ xfs_setfilesize(
 	return xfs_trans_commit(tp);
 }
 
+int
+xfs_setfilesize(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	size_t			size)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+	if (error)
+		return error;
+
+	return __xfs_setfilesize(ip, tp, offset, size);
+}
+
 STATIC int
 xfs_setfilesize_ioend(
 	struct xfs_ioend	*ioend,
@@ -247,7 +264,7 @@ xfs_setfilesize_ioend(
 		return error;
 	}
 
-	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 
 /*
@@ -1336,13 +1353,12 @@ xfs_end_io_direct_write(
 {
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
 	uintptr_t		flags = (uintptr_t)private;
 	int			error = 0;
 
 	trace_xfs_end_io_direct_write(ip, offset, size);
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
 	if (size <= 0)
@@ -1380,14 +1396,9 @@ xfs_end_io_direct_write(
 
 		error = xfs_iomap_write_unwritten(ip, offset, size);
 	} else if (flags & XFS_DIO_FLAG_APPEND) {
-		struct xfs_trans *tp;
-
 		trace_xfs_end_io_direct_write_append(ip, offset, size);
 
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
-				&tp);
-		if (!error)
-			error = xfs_setfilesize(ip, tp, offset, size);
+		error = xfs_setfilesize(ip, offset, size);
 	}
 
 	return error;

fs/xfs/xfs_aops.h (+1 -0)

@@ -62,6 +62,7 @@ int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
 
 int	xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
 		ssize_t size, void *private);
+int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);

fs/xfs/xfs_file.c (+17 -62)

@@ -333,10 +333,7 @@ xfs_file_dax_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct iov_iter		data = *to;
+	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
 	size_t			count = iov_iter_count(to);
 	ssize_t			ret = 0;
 
@@ -346,11 +343,7 @@ xfs_file_dax_read(
 		return 0; /* skip atime */
 
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(to, ret);
-	}
+	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	file_accessed(iocb->ki_filp);
@@ -712,70 +705,32 @@ xfs_file_dax_write(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
-	struct iov_iter		data;
+	int			iolock = XFS_IOLOCK_EXCL;
+	ssize_t			ret, error = 0;
+	size_t			count;
+	loff_t			pos;
 
-	/* "unaligned" here means not aligned to a filesystem block */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
-		unaligned_io = 1;
-		iolock = XFS_IOLOCK_EXCL;
-	} else if (mapping->nrpages) {
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-	}
 	xfs_rw_ilock(ip, iolock);
-
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 
-	/*
-	 * Yes, even DAX files can have page cache attached to them:  A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole.  It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. dax_do_io() should really do this invalidation internally as
-	 * it will know if we've allocated over a holei for this specific IO and
-	 * if so it needs to update the mapping tree and invalidate existing
-	 * PTEs over the newly allocated range. Remove this invalidation when
-	 * dax_do_io() is fixed up.
-	 */
-	if (mapping->nrpages) {
-		loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+	pos = iocb->ki_pos;
+	count = iov_iter_count(from);
 
-		ret = invalidate_inode_pages2_range(mapping,
-						    iocb->ki_pos >> PAGE_SHIFT,
-						    end >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
+	trace_xfs_file_dax_write(ip, count, pos);
 
-	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		iolock = XFS_IOLOCK_SHARED;
+	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		error = xfs_setfilesize(ip, pos, ret);
 	}
 
-	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
-
-	data = *from;
-	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-			xfs_end_io_direct_write, 0);
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(from, ret);
-	}
 out:
 	xfs_rw_iunlock(ip, iolock);
-	return ret;
+	return error ? error : ret;
 }
 
 STATIC ssize_t
@@ -1514,7 +1469,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
+		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
@@ -1548,7 +1503,7 @@ xfs_filemap_fault(
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
+		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

fs/xfs/xfs_iomap.c (+14 -8)

@@ -934,11 +934,13 @@ error_on_bmapi_transaction:
 	return error;
 }
 
-static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool imap_needs_alloc(struct inode *inode,
+		struct xfs_bmbt_irec *imap, int nimaps)
 {
 	return !nimaps ||
 		imap->br_startblock == HOLESTARTBLOCK ||
-		imap->br_startblock == DELAYSTARTBLOCK;
+		imap->br_startblock == DELAYSTARTBLOCK ||
+		(IS_DAX(inode) && ISUNWRITTEN(imap));
 }
 
 static int
@@ -954,16 +956,18 @@ xfs_file_iomap_begin(
 	struct xfs_bmbt_irec	imap;
 	xfs_fileoff_t		offset_fsb, end_fsb;
 	int			nimaps = 1, error = 0;
+	unsigned		lockmode;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if ((flags & IOMAP_WRITE) && !xfs_get_extsz_hint(ip)) {
+	if ((flags & IOMAP_WRITE) &&
+	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
 				iomap);
 	}
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	lockmode = xfs_ilock_data_map_shared(ip);
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
@@ -974,11 +978,11 @@ xfs_file_iomap_begin(
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, XFS_BMAPI_ENTIRE);
 	if (error) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		return error;
 	}
 
-	if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+	if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
 		/*
 		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
 		 * pages to keep the chunks of work done where somewhat symmetric
@@ -994,17 +998,19 @@ xfs_file_iomap_begin(
 		 * xfs_iomap_write_direct() expects the shared lock. It
 		 * is unlocked on return.
 		 */
-		xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+		if (lockmode == XFS_ILOCK_EXCL)
+			xfs_ilock_demote(ip, lockmode);
 		error = xfs_iomap_write_direct(ip, offset, length, &imap,
 				nimaps);
 		if (error)
 			return error;
 
+		iomap->flags = IOMAP_F_NEW;
 		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
 	} else {
 		ASSERT(nimaps);
 
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
 	}
 

include/linux/dax.h (+6 -0)

@@ -6,13 +6,19 @@
 #include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+struct iomap_ops;
+
 /* We use lowest available exceptional entry bit for locking */
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 
+ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops);
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
 		  get_block_t, dio_iodone_t, int flags);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			struct iomap_ops *ops);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,

include/linux/iomap.h (+1 -0)

@@ -23,6 +23,7 @@ struct vm_fault;
  */
 #define IOMAP_F_MERGED	0x01	/* contains multiple blocks/extents */
 #define IOMAP_F_SHARED	0x02	/* block shared with another file */
+#define IOMAP_F_NEW	0x04	/* blocks have been newly allocated */
 
 /*
  * Magic value for blkno: