8 years ago · dd936e4313
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers
 
				 Filesystem support consists of
			
 
				 - adding support to mark inodes as being DAX by setting the S_DAX flag in
			
 
				   i_flags
			
 
				-- implementing the direct_IO address space operation, and calling
			
 
				-  dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
			
 
				+- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw()
			
 
				+  when inode has S_DAX flag set
			
 
				 - implementing an mmap file operation for DAX files which sets the
			
 
				   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
			
 
				-  include handlers for fault, pmd_fault and page_mkwrite (which should
			
 
				-  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
			
 
				-  appropriate get_block() callback)
			
 
				-- calling dax_truncate_page() instead of block_truncate_page() for DAX files
			
 
				-- calling dax_zero_page_range() instead of zero_user() for DAX files
			
 
				+  include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
			
 
				+  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
			
 
				+  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
			
 
				+  iomap operations.
			
 
				+- calling iomap_zero_range() passing appropriate iomap operations instead of
			
 
				+  block_truncate_page() for DAX files
			
 
				 - ensuring that there is sufficient locking between reads, writes,
			
 
				   truncates and page faults
			
 
				 
			
 
				-The get_block() callback passed to the DAX functions may return
			
 
				-uninitialised extents.  If it does, it must ensure that simultaneous
			
 
				-calls to get_block() (for example by a page-fault racing with a read()
			
 
				-or a write()) work correctly.
			
 
				+The iomap handlers for allocating blocks must make sure that allocated blocks
			
 
				+are zeroed out and converted to written extents before being returned to avoid
			
 
				+exposure of uninitialized data through mmap.
			
 
				 
			
 
				 These filesystems may be used for inspiration:
			
 
				 - ext2: see Documentation/filesystems/ext2.txt
			
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
 
				 	return page;
			
 
				 }
			
 
				 
			
 
				-static bool buffer_written(struct buffer_head *bh)
			
 
				-{
			
 
				-	return buffer_mapped(bh) && !buffer_unwritten(bh);
			
 
				-}
			
 
				-
			
 
				-static sector_t to_sector(const struct buffer_head *bh,
			
 
				-		const struct inode *inode)
			
 
				-{
			
 
				-	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
			
 
				-
			
 
				-	return sector;
			
 
				-}
			
 
				-
			
 
				-static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
			
 
				-		      loff_t start, loff_t end, get_block_t get_block,
			
 
				-		      struct buffer_head *bh)
			
 
				-{
			
 
				-	loff_t pos = start, max = start, bh_max = start;
			
 
				-	bool hole = false;
			
 
				-	struct block_device *bdev = NULL;
			
 
				-	int rw = iov_iter_rw(iter), rc;
			
 
				-	long map_len = 0;
			
 
				-	struct blk_dax_ctl dax = {
			
 
				-		.addr = ERR_PTR(-EIO),
			
 
				-	};
			
 
				-	unsigned blkbits = inode->i_blkbits;
			
 
				-	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
			
 
				-								>> blkbits;
			
 
				-
			
 
				-	if (rw == READ)
			
 
				-		end = min(end, i_size_read(inode));
			
 
				-
			
 
				-	while (pos < end) {
			
 
				-		size_t len;
			
 
				-		if (pos == max) {
			
 
				-			long page = pos >> PAGE_SHIFT;
			
 
				-			sector_t block = page << (PAGE_SHIFT - blkbits);
			
 
				-			unsigned first = pos - (block << blkbits);
			
 
				-			long size;
			
 
				-
			
 
				-			if (pos == bh_max) {
			
 
				-				bh->b_size = PAGE_ALIGN(end - pos);
			
 
				-				bh->b_state = 0;
			
 
				-				rc = get_block(inode, block, bh, rw == WRITE);
			
 
				-				if (rc)
			
 
				-					break;
			
 
				-				bh_max = pos - first + bh->b_size;
			
 
				-				bdev = bh->b_bdev;
			
 
				-				/*
			
 
				-				 * We allow uninitialized buffers for writes
			
 
				-				 * beyond EOF as those cannot race with faults
			
 
				-				 */
			
 
				-				WARN_ON_ONCE(
			
 
				-					(buffer_new(bh) && block < file_blks) ||
			
 
				-					(rw == WRITE && buffer_unwritten(bh)));
			
 
				-			} else {
			
 
				-				unsigned done = bh->b_size -
			
 
				-						(bh_max - (pos - first));
			
 
				-				bh->b_blocknr += done >> blkbits;
			
 
				-				bh->b_size -= done;
			
 
				-			}
			
 
				-
			
 
				-			hole = rw == READ && !buffer_written(bh);
			
 
				-			if (hole) {
			
 
				-				size = bh->b_size - first;
			
 
				-			} else {
			
 
				-				dax_unmap_atomic(bdev, &dax);
			
 
				-				dax.sector = to_sector(bh, inode);
			
 
				-				dax.size = bh->b_size;
			
 
				-				map_len = dax_map_atomic(bdev, &dax);
			
 
				-				if (map_len < 0) {
			
 
				-					rc = map_len;
			
 
				-					break;
			
 
				-				}
			
 
				-				dax.addr += first;
			
 
				-				size = map_len - first;
			
 
				-			}
			
 
				-			/*
			
 
				-			 * pos + size is one past the last offset for IO,
			
 
				-			 * so pos + size can overflow loff_t at extreme offsets.
			
 
				-			 * Cast to u64 to catch this and get the true minimum.
			
 
				-			 */
			
 
				-			max = min_t(u64, pos + size, end);
			
 
				-		}
			
 
				-
			
 
				-		if (iov_iter_rw(iter) == WRITE) {
			
 
				-			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			
 
				-		} else if (!hole)
			
 
				-			len = copy_to_iter((void __force *) dax.addr, max - pos,
			
 
				-					iter);
			
 
				-		else
			
 
				-			len = iov_iter_zero(max - pos, iter);
			
 
				-
			
 
				-		if (!len) {
			
 
				-			rc = -EFAULT;
			
 
				-			break;
			
 
				-		}
			
 
				-
			
 
				-		pos += len;
			
 
				-		if (!IS_ERR(dax.addr))
			
 
				-			dax.addr += len;
			
 
				-	}
			
 
				-
			
 
				-	dax_unmap_atomic(bdev, &dax);
			
 
				-
			
 
				-	return (pos == start) ? rc : pos - start;
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				- * dax_do_io - Perform I/O to a DAX file
			
 
				- * @iocb: The control block for this I/O
			
 
				- * @inode: The file which the I/O is directed at
			
 
				- * @iter: The addresses to do I/O from or to
			
 
				- * @get_block: The filesystem method used to translate file offsets to blocks
			
 
				- * @end_io: A filesystem callback for I/O completion
			
 
				- * @flags: See below
			
 
				- *
			
 
				- * This function uses the same locking scheme as do_blockdev_direct_IO:
			
 
				- * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
			
 
				- * caller for writes.  For reads, we take and release the i_mutex ourselves.
			
 
				- * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
			
 
				- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
			
 
				- * is in progress.
			
 
				- */
			
 
				-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
			
 
				-		  struct iov_iter *iter, get_block_t get_block,
			
 
				-		  dio_iodone_t end_io, int flags)
			
 
				-{
			
 
				-	struct buffer_head bh;
			
 
				-	ssize_t retval = -EINVAL;
			
 
				-	loff_t pos = iocb->ki_pos;
			
 
				-	loff_t end = pos + iov_iter_count(iter);
			
 
				-
			
 
				-	memset(&bh, 0, sizeof(bh));
			
 
				-	bh.b_bdev = inode->i_sb->s_bdev;
			
 
				-
			
 
				-	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
			
 
				-		inode_lock(inode);
			
 
				-
			
 
				-	/* Protects against truncate */
			
 
				-	if (!(flags & DIO_SKIP_DIO_COUNT))
			
 
				-		inode_dio_begin(inode);
			
 
				-
			
 
				-	retval = dax_io(inode, iter, pos, end, get_block, &bh);
			
 
				-
			
 
				-	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
			
 
				-		inode_unlock(inode);
			
 
				-
			
 
				-	if (end_io) {
			
 
				-		int err;
			
 
				-
			
 
				-		err = end_io(iocb, pos, retval, bh.b_private);
			
 
				-		if (err)
			
 
				-			retval = err;
			
 
				-	}
			
 
				-
			
 
				-	if (!(flags & DIO_SKIP_DIO_COUNT))
			
 
				-		inode_dio_end(inode);
			
 
				-	return retval;
			
 
				-}
			
 
				-EXPORT_SYMBOL_GPL(dax_do_io);
			
 
				-
			
 
				 /*
			
 
				  * DAX radix tree locking
			
 
				  */
			
@@ -919,105 +757,6 @@ static int dax_insert_mapping(struct address_space *mapping,
 
				 	return vm_insert_mixed(vma, vaddr, dax.pfn);
			
 
				 }
			
 
				 
			
 
				-/**
			
 
				- * dax_fault - handle a page fault on a DAX file
			
 
				- * @vma: The virtual memory area where the fault occurred
			
 
				- * @vmf: The description of the fault
			
 
				- * @get_block: The filesystem method used to translate file offsets to blocks
			
 
				- *
			
 
				- * When a page fault occurs, filesystems may call this helper in their
			
 
				- * fault handler for DAX files. dax_fault() assumes the caller has done all
			
 
				- * the necessary locking for the page fault to proceed successfully.
			
 
				- */
			
 
				-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			
 
				-			get_block_t get_block)
			
 
				-{
			
 
				-	struct file *file = vma->vm_file;
			
 
				-	struct address_space *mapping = file->f_mapping;
			
 
				-	struct inode *inode = mapping->host;
			
 
				-	void *entry;
			
 
				-	struct buffer_head bh;
			
 
				-	unsigned long vaddr = (unsigned long)vmf->virtual_address;
			
 
				-	unsigned blkbits = inode->i_blkbits;
			
 
				-	sector_t block;
			
 
				-	pgoff_t size;
			
 
				-	int error;
			
 
				-	int major = 0;
			
 
				-
			
 
				-	/*
			
 
				-	 * Check whether offset isn't beyond end of file now. Caller is supposed
			
 
				-	 * to hold locks serializing us with truncate / punch hole so this is
			
 
				-	 * a reliable test.
			
 
				-	 */
			
 
				-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
			
 
				-	if (vmf->pgoff >= size)
			
 
				-		return VM_FAULT_SIGBUS;
			
 
				-
			
 
				-	memset(&bh, 0, sizeof(bh));
			
 
				-	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
			
 
				-	bh.b_bdev = inode->i_sb->s_bdev;
			
 
				-	bh.b_size = PAGE_SIZE;
			
 
				-
			
 
				-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
			
 
				-	if (IS_ERR(entry)) {
			
 
				-		error = PTR_ERR(entry);
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				-	error = get_block(inode, block, &bh, 0);
			
 
				-	if (!error && (bh.b_size < PAGE_SIZE))
			
 
				-		error = -EIO;		/* fs corruption? */
			
 
				-	if (error)
			
 
				-		goto unlock_entry;
			
 
				-
			
 
				-	if (vmf->cow_page) {
			
 
				-		struct page *new_page = vmf->cow_page;
			
 
				-		if (buffer_written(&bh))
			
 
				-			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
			
 
				-					bh.b_size, new_page, vaddr);
			
 
				-		else
			
 
				-			clear_user_highpage(new_page, vaddr);
			
 
				-		if (error)
			
 
				-			goto unlock_entry;
			
 
				-		if (!radix_tree_exceptional_entry(entry)) {
			
 
				-			vmf->page = entry;
			
 
				-			return VM_FAULT_LOCKED;
			
 
				-		}
			
 
				-		vmf->entry = entry;
			
 
				-		return VM_FAULT_DAX_LOCKED;
			
 
				-	}
			
 
				-
			
 
				-	if (!buffer_mapped(&bh)) {
			
 
				-		if (vmf->flags & FAULT_FLAG_WRITE) {
			
 
				-			error = get_block(inode, block, &bh, 1);
			
 
				-			count_vm_event(PGMAJFAULT);
			
 
				-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			
 
				-			major = VM_FAULT_MAJOR;
			
 
				-			if (!error && (bh.b_size < PAGE_SIZE))
			
 
				-				error = -EIO;
			
 
				-			if (error)
			
 
				-				goto unlock_entry;
			
 
				-		} else {
			
 
				-			return dax_load_hole(mapping, entry, vmf);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	/* Filesystem should not return unwritten buffers to us! */
			
 
				-	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
			
 
				-	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
			
 
				-			bh.b_size, &entry, vma, vmf);
			
 
				- unlock_entry:
			
 
				-	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
			
 
				- out:
			
 
				-	if (error == -ENOMEM)
			
 
				-		return VM_FAULT_OOM | major;
			
 
				-	/* -EBUSY is fine, somebody else faulted on the same PTE */
			
 
				-	if ((error < 0) && (error != -EBUSY))
			
 
				-		return VM_FAULT_SIGBUS | major;
			
 
				-	return VM_FAULT_NOPAGE | major;
			
 
				-}
			
 
				-EXPORT_SYMBOL_GPL(dax_fault);
			
 
				-
			
 
				 /**
			
 
				  * dax_pfn_mkwrite - handle first write to DAX page
			
 
				  * @vma: The virtual memory area where the fault occurred
			
@@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
			
 
				 
			
 
				-/**
			
 
				- * dax_zero_page_range - zero a range within a page of a DAX file
			
 
				- * @inode: The file being truncated
			
 
				- * @from: The file offset that is being truncated to
			
 
				- * @length: The number of bytes to zero
			
 
				- * @get_block: The filesystem method used to translate file offsets to blocks
			
 
				- *
			
 
				- * This function can be called by a filesystem when it is zeroing part of a
			
 
				- * page in a DAX file.  This is intended for hole-punch operations.  If
			
 
				- * you are truncating a file, the helper function dax_truncate_page() may be
			
 
				- * more convenient.
			
 
				- */
			
 
				-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
			
 
				-							get_block_t get_block)
			
 
				-{
			
 
				-	struct buffer_head bh;
			
 
				-	pgoff_t index = from >> PAGE_SHIFT;
			
 
				-	unsigned offset = from & (PAGE_SIZE-1);
			
 
				-	int err;
			
 
				-
			
 
				-	/* Block boundary? Nothing to do */
			
 
				-	if (!length)
			
 
				-		return 0;
			
 
				-	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	memset(&bh, 0, sizeof(bh));
			
 
				-	bh.b_bdev = inode->i_sb->s_bdev;
			
 
				-	bh.b_size = PAGE_SIZE;
			
 
				-	err = get_block(inode, index, &bh, 0);
			
 
				-	if (err < 0 || !buffer_written(&bh))
			
 
				-		return err;
			
 
				-
			
 
				-	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
			
 
				-			offset, length);
			
 
				-}
			
 
				-EXPORT_SYMBOL_GPL(dax_zero_page_range);
			
 
				-
			
 
				-/**
			
 
				- * dax_truncate_page - handle a partial page being truncated in a DAX file
			
 
				- * @inode: The file being truncated
			
 
				- * @from: The file offset that is being truncated to
			
 
				- * @get_block: The filesystem method used to translate file offsets to blocks
			
 
				- *
			
 
				- * Similar to block_truncate_page(), this function can be called by a
			
 
				- * filesystem when it is truncating a DAX file to handle the partial page.
			
 
				- */
			
 
				-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
			
 
				-{
			
 
				-	unsigned length = PAGE_ALIGN(from) - from;
			
 
				-	return dax_zero_page_range(inode, from, length, get_block);
			
 
				-}
			
 
				-EXPORT_SYMBOL_GPL(dax_truncate_page);
			
 
				-
			
 
				 #ifdef CONFIG_FS_IOMAP
			
 
				 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
			
 
				 {
			
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 
				 
			
 
				 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
			
 
				 		struct iomap_ops *ops);
			
 
				-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
			
 
				-		  get_block_t, dio_iodone_t, int flags);
			
 
				-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
			
 
				-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
			
 
				 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			
 
				 			struct iomap_ops *ops);
			
 
				-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
			
 
				 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
			
 
				 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
			
 
				 		pgoff_t index, void *entry, bool wake_all);
			
@@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
			
 
				-				pmd_t *pmd, unsigned int flags, get_block_t gb)
			
 
				-{
			
 
				-	return VM_FAULT_FALLBACK;
			
 
				-}
			
 
				-
			
 
				 #ifdef CONFIG_FS_DAX_PMD
			
 
				 static inline unsigned int dax_radix_order(void *entry)
			
 
				 {
			
@@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
 
				 }
			
 
				 #endif
			
 
				 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
			
 
				-#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
			
 
				 
			
 
				 static inline bool vma_is_dax(struct vm_area_struct *vma)
			
 
				 {