9 年之前 · 478a1469a7
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -52,6 +52,7 @@ config FS_DAX_PMD
 
				 	depends on FS_DAX
			
 
				 	depends on ZONE_DEVICE
			
 
				 	depends on TRANSPARENT_HUGEPAGE
			
 
				+	depends on BROKEN
			
 
				 
			
 
				 endif # BLOCK
			
 
				 
			
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,14 +32,43 @@
 
				 #include <linux/pfn_t.h>
			
 
				 #include <linux/sizes.h>
			
 
				 
			
 
				-#define RADIX_DAX_MASK	0xf
			
 
				-#define RADIX_DAX_SHIFT	4
			
 
				-#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
			
 
				-#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
			
 
				-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
			
 
				+/*
			
 
				+ * We use the lowest available bit in an exceptional entry for locking and the
			
 
				+ * next two bits to determine the entry type. In total 3 special bits.
			
 
				+ */
			
 
				+#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
			
 
				+#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
			
 
				+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
			
 
				+#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
			
 
				+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
			
 
				 #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
			
 
				 #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
			
 
				-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
			
 
				+		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
			
 
				+		RADIX_TREE_EXCEPTIONAL_ENTRY))
			
 
				+
			
 
				+/* We choose 4096 entries - same as per-zone page wait tables */
			
 
				+#define DAX_WAIT_TABLE_BITS 12
			
 
				+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
			
 
				+
			
 
				+wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
			
 
				+
			
 
				+static int __init init_dax_wait_table(void)
			
 
				+{
			
 
				+	int i;
			
 
				+
			
 
				+	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
			
 
				+		init_waitqueue_head(wait_table + i);
			
 
				+	return 0;
			
 
				+}
			
 
				+fs_initcall(init_dax_wait_table);
			
 
				+
			
 
				+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
			
 
				+					      pgoff_t index)
			
 
				+{
			
 
				+	unsigned long hash = hash_long((unsigned long)mapping ^ index,
			
 
				+				       DAX_WAIT_TABLE_BITS);
			
 
				+	return wait_table + hash;
			
 
				+}
			
 
				 
			
 
				 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
			
 
				 {
			
@@ -262,6 +291,263 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dax_do_io);
			
 
				 
			
 
				+/*
			
 
				+ * DAX radix tree locking
			
 
				+ */
			
 
				+struct exceptional_entry_key {
			
 
				+	struct address_space *mapping;
			
 
				+	unsigned long index;
			
 
				+};
			
 
				+
			
 
				+struct wait_exceptional_entry_queue {
			
 
				+	wait_queue_t wait;
			
 
				+	struct exceptional_entry_key key;
			
 
				+};
			
 
				+
			
 
				+static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
			
 
				+				       int sync, void *keyp)
			
 
				+{
			
 
				+	struct exceptional_entry_key *key = keyp;
			
 
				+	struct wait_exceptional_entry_queue *ewait =
			
 
				+		container_of(wait, struct wait_exceptional_entry_queue, wait);
			
 
				+
			
 
				+	if (key->mapping != ewait->key.mapping ||
			
 
				+	    key->index != ewait->key.index)
			
 
				+		return 0;
			
 
				+	return autoremove_wake_function(wait, mode, sync, NULL);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Check whether the given slot is locked. The function must be called with
			
 
				+ * mapping->tree_lock held
			
 
				+ */
			
 
				+static inline int slot_locked(struct address_space *mapping, void **slot)
			
 
				+{
			
 
				+	unsigned long entry = (unsigned long)
			
 
				+		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
			
 
				+	return entry & RADIX_DAX_ENTRY_LOCK;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Mark the given slot as locked. The function must be called with
			
 
				+ * mapping->tree_lock held
			
 
				+ */
			
 
				+static inline void *lock_slot(struct address_space *mapping, void **slot)
			
 
				+{
			
 
				+	unsigned long entry = (unsigned long)
			
 
				+		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
			
 
				+
			
 
				+	entry |= RADIX_DAX_ENTRY_LOCK;
			
 
				+	radix_tree_replace_slot(slot, (void *)entry);
			
 
				+	return (void *)entry;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Mark the given slot as unlocked. The function must be called with
			
 
				+ * mapping->tree_lock held
			
 
				+ */
			
 
				+static inline void *unlock_slot(struct address_space *mapping, void **slot)
			
 
				+{
			
 
				+	unsigned long entry = (unsigned long)
			
 
				+		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
			
 
				+
			
 
				+	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
			
 
				+	radix_tree_replace_slot(slot, (void *)entry);
			
 
				+	return (void *)entry;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Lookup entry in radix tree, wait for it to become unlocked if it is
			
 
				+ * exceptional entry and return it. The caller must call
			
 
				+ * put_unlocked_mapping_entry() when it decides not to lock the entry or
			
 
				+ * put_locked_mapping_entry() when it has locked the entry and now wants to
			
 
				+ * unlock it.
			
 
				+ *
			
 
				+ * The function must be called with mapping->tree_lock held.
			
 
				+ */
			
 
				+static void *get_unlocked_mapping_entry(struct address_space *mapping,
			
 
				+					pgoff_t index, void ***slotp)
			
 
				+{
			
 
				+	void *ret, **slot;
			
 
				+	struct wait_exceptional_entry_queue ewait;
			
 
				+	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
			
 
				+
			
 
				+	init_wait(&ewait.wait);
			
 
				+	ewait.wait.func = wake_exceptional_entry_func;
			
 
				+	ewait.key.mapping = mapping;
			
 
				+	ewait.key.index = index;
			
 
				+
			
 
				+	for (;;) {
			
 
				+		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
			
 
				+					  &slot);
			
 
				+		if (!ret || !radix_tree_exceptional_entry(ret) ||
			
 
				+		    !slot_locked(mapping, slot)) {
			
 
				+			if (slotp)
			
 
				+				*slotp = slot;
			
 
				+			return ret;
			
 
				+		}
			
 
				+		prepare_to_wait_exclusive(wq, &ewait.wait,
			
 
				+					  TASK_UNINTERRUPTIBLE);
			
 
				+		spin_unlock_irq(&mapping->tree_lock);
			
 
				+		schedule();
			
 
				+		finish_wait(wq, &ewait.wait);
			
 
				+		spin_lock_irq(&mapping->tree_lock);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Find radix tree entry at given index. If it points to a page, return with
			
 
				+ * the page locked. If it points to the exceptional entry, return with the
			
 
				+ * radix tree entry locked. If the radix tree doesn't contain given index,
			
 
				+ * create empty exceptional entry for the index and return with it locked.
			
 
				+ *
			
 
				+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
			
 
				+ * persistent memory the benefit is doubtful. We can add that later if we can
			
 
				+ * show it helps.
			
 
				+ */
			
 
				+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
			
 
				+{
			
 
				+	void *ret, **slot;
			
 
				+
			
 
				+restart:
			
 
				+	spin_lock_irq(&mapping->tree_lock);
			
 
				+	ret = get_unlocked_mapping_entry(mapping, index, &slot);
			
 
				+	/* No entry for given index? Make sure radix tree is big enough. */
			
 
				+	if (!ret) {
			
 
				+		int err;
			
 
				+
			
 
				+		spin_unlock_irq(&mapping->tree_lock);
			
 
				+		err = radix_tree_preload(
			
 
				+				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
			
 
				+		if (err)
			
 
				+			return ERR_PTR(err);
			
 
				+		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
			
 
				+			       RADIX_DAX_ENTRY_LOCK);
			
 
				+		spin_lock_irq(&mapping->tree_lock);
			
 
				+		err = radix_tree_insert(&mapping->page_tree, index, ret);
			
 
				+		radix_tree_preload_end();
			
 
				+		if (err) {
			
 
				+			spin_unlock_irq(&mapping->tree_lock);
			
 
				+			/* Someone already created the entry? */
			
 
				+			if (err == -EEXIST)
			
 
				+				goto restart;
			
 
				+			return ERR_PTR(err);
			
 
				+		}
			
 
				+		/* Good, we have inserted empty locked entry into the tree. */
			
 
				+		mapping->nrexceptional++;
			
 
				+		spin_unlock_irq(&mapping->tree_lock);
			
 
				+		return ret;
			
 
				+	}
			
 
				+	/* Normal page in radix tree? */
			
 
				+	if (!radix_tree_exceptional_entry(ret)) {
			
 
				+		struct page *page = ret;
			
 
				+
			
 
				+		get_page(page);
			
 
				+		spin_unlock_irq(&mapping->tree_lock);
			
 
				+		lock_page(page);
			
 
				+		/* Page got truncated? Retry... */
			
 
				+		if (unlikely(page->mapping != mapping)) {
			
 
				+			unlock_page(page);
			
 
				+			put_page(page);
			
 
				+			goto restart;
			
 
				+		}
			
 
				+		return page;
			
 
				+	}
			
 
				+	ret = lock_slot(mapping, slot);
			
 
				+	spin_unlock_irq(&mapping->tree_lock);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
			
 
				+				   pgoff_t index, bool wake_all)
			
 
				+{
			
 
				+	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
			
 
				+
			
 
				+	/*
			
 
				+	 * Checking for locked entry and prepare_to_wait_exclusive() happens
			
 
				+	 * under mapping->tree_lock, ditto for entry handling in our callers.
			
 
				+	 * So at this point all tasks that could have seen our entry locked
			
 
				+	 * must be in the waitqueue and the following check will see them.
			
 
				+	 */
			
 
				+	if (waitqueue_active(wq)) {
			
 
				+		struct exceptional_entry_key key;
			
 
				+
			
 
				+		key.mapping = mapping;
			
 
				+		key.index = index;
			
 
				+		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
			
 
				+{
			
 
				+	void *ret, **slot;
			
 
				+
			
 
				+	spin_lock_irq(&mapping->tree_lock);
			
 
				+	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
			
 
				+	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
			
 
				+			 !slot_locked(mapping, slot))) {
			
 
				+		spin_unlock_irq(&mapping->tree_lock);
			
 
				+		return;
			
 
				+	}
			
 
				+	unlock_slot(mapping, slot);
			
 
				+	spin_unlock_irq(&mapping->tree_lock);
			
 
				+	dax_wake_mapping_entry_waiter(mapping, index, false);
			
 
				+}
			
 
				+
			
 
				+static void put_locked_mapping_entry(struct address_space *mapping,
			
 
				+				     pgoff_t index, void *entry)
			
 
				+{
			
 
				+	if (!radix_tree_exceptional_entry(entry)) {
			
 
				+		unlock_page(entry);
			
 
				+		put_page(entry);
			
 
				+	} else {
			
 
				+		dax_unlock_mapping_entry(mapping, index);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Called when we are done with radix tree entry we looked up via
			
 
				+ * get_unlocked_mapping_entry() and which we didn't lock in the end.
			
 
				+ */
			
 
				+static void put_unlocked_mapping_entry(struct address_space *mapping,
			
 
				+				       pgoff_t index, void *entry)
			
 
				+{
			
 
				+	if (!radix_tree_exceptional_entry(entry))
			
 
				+		return;
			
 
				+
			
 
				+	/* We have to wake up the next waiter for the radix tree entry lock */
			
 
				+	dax_wake_mapping_entry_waiter(mapping, index, false);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
			
 
				+ * entry to get unlocked before deleting it.
			
 
				+ */
			
 
				+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
			
 
				+{
			
 
				+	void *entry;
			
 
				+
			
 
				+	spin_lock_irq(&mapping->tree_lock);
			
 
				+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
			
 
				+	/*
			
 
				+	 * This gets called from truncate / punch_hole path. As such, the caller
			
 
				+	 * must hold locks protecting against concurrent modifications of the
			
 
				+	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
			
 
				+	 * caller has seen exceptional entry for this index, we better find it
			
 
				+	 * at that index as well...
			
 
				+	 */
			
 
				+	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
			
 
				+		spin_unlock_irq(&mapping->tree_lock);
			
 
				+		return 0;
			
 
				+	}
			
 
				+	radix_tree_delete(&mapping->page_tree, index);
			
 
				+	mapping->nrexceptional--;
			
 
				+	spin_unlock_irq(&mapping->tree_lock);
			
 
				+	dax_wake_mapping_entry_waiter(mapping, index, true);
			
 
				+
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * The user has performed a load from a hole in the file.  Allocating
			
 
				  * a new page in the file would cause excessive storage usage for
			
@@ -270,15 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io);
 
				  * otherwise it will simply fall out of the page cache under memory
			
 
				  * pressure without ever having been dirtied.
			
 
				  */
			
 
				-static int dax_load_hole(struct address_space *mapping, struct page *page,
			
 
				-							struct vm_fault *vmf)
			
 
				+static int dax_load_hole(struct address_space *mapping, void *entry,
			
 
				+			 struct vm_fault *vmf)
			
 
				 {
			
 
				-	if (!page)
			
 
				-		page = find_or_create_page(mapping, vmf->pgoff,
			
 
				-						GFP_KERNEL | __GFP_ZERO);
			
 
				-	if (!page)
			
 
				-		return VM_FAULT_OOM;
			
 
				+	struct page *page;
			
 
				 
			
 
				+	/* Hole page already exists? Return it...  */
			
 
				+	if (!radix_tree_exceptional_entry(entry)) {
			
 
				+		vmf->page = entry;
			
 
				+		return VM_FAULT_LOCKED;
			
 
				+	}
			
 
				+
			
 
				+	/* This will replace locked radix tree entry with a hole page */
			
 
				+	page = find_or_create_page(mapping, vmf->pgoff,
			
 
				+				   vmf->gfp_mask | __GFP_ZERO);
			
 
				+	if (!page) {
			
 
				+		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
			
 
				+		return VM_FAULT_OOM;
			
 
				+	}
			
 
				 	vmf->page = page;
			
 
				 	return VM_FAULT_LOCKED;
			
 
				 }
			
@@ -302,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-#define NO_SECTOR -1
			
 
				 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
			
 
				 
			
 
				-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
			
 
				-		sector_t sector, bool pmd_entry, bool dirty)
			
 
				+static void *dax_insert_mapping_entry(struct address_space *mapping,
			
 
				+				      struct vm_fault *vmf,
			
 
				+				      void *entry, sector_t sector)
			
 
				 {
			
 
				 	struct radix_tree_root *page_tree = &mapping->page_tree;
			
 
				-	pgoff_t pmd_index = DAX_PMD_INDEX(index);
			
 
				-	int type, error = 0;
			
 
				-	void *entry;
			
 
				+	int error = 0;
			
 
				+	bool hole_fill = false;
			
 
				+	void *new_entry;
			
 
				+	pgoff_t index = vmf->pgoff;
			
 
				 
			
 
				-	WARN_ON_ONCE(pmd_entry && !dirty);
			
 
				-	if (dirty)
			
 
				+	if (vmf->flags & FAULT_FLAG_WRITE)
			
 
				 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
			
 
				 
			
 
				-	spin_lock_irq(&mapping->tree_lock);
			
 
				-
			
 
				-	entry = radix_tree_lookup(page_tree, pmd_index);
			
 
				-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
			
 
				-		index = pmd_index;
			
 
				-		goto dirty;
			
 
				+	/* Replacing hole page with block mapping? */
			
 
				+	if (!radix_tree_exceptional_entry(entry)) {
			
 
				+		hole_fill = true;
			
 
				+		/*
			
 
				+		 * Unmap the page now before we remove it from page cache below.
			
 
				+		 * The page is locked so it cannot be faulted in again.
			
 
				+		 */
			
 
				+		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
			
 
				+				    PAGE_SIZE, 0);
			
 
				+		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
			
 
				+		if (error)
			
 
				+			return ERR_PTR(error);
			
 
				 	}
			
 
				 
			
 
				-	entry = radix_tree_lookup(page_tree, index);
			
 
				-	if (entry) {
			
 
				-		type = RADIX_DAX_TYPE(entry);
			
 
				-		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
			
 
				-					type != RADIX_DAX_PMD)) {
			
 
				-			error = -EIO;
			
 
				+	spin_lock_irq(&mapping->tree_lock);
			
 
				+	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
			
 
				+		       RADIX_DAX_ENTRY_LOCK);
			
 
				+	if (hole_fill) {
			
 
				+		__delete_from_page_cache(entry, NULL);
			
 
				+		/* Drop pagecache reference */
			
 
				+		put_page(entry);
			
 
				+		error = radix_tree_insert(page_tree, index, new_entry);
			
 
				+		if (error) {
			
 
				+			new_entry = ERR_PTR(error);
			
 
				 			goto unlock;
			
 
				 		}
			
 
				+		mapping->nrexceptional++;
			
 
				+	} else {
			
 
				+		void **slot;
			
 
				+		void *ret;
			
 
				 
			
 
				-		if (!pmd_entry || type == RADIX_DAX_PMD)
			
 
				-			goto dirty;
			
 
				-
			
 
				-		/*
			
 
				-		 * We only insert dirty PMD entries into the radix tree.  This
			
 
				-		 * means we don't need to worry about removing a dirty PTE
			
 
				-		 * entry and inserting a clean PMD entry, thus reducing the
			
 
				-		 * range we would flush with a follow-up fsync/msync call.
			
 
				-		 */
			
 
				-		radix_tree_delete(&mapping->page_tree, index);
			
 
				-		mapping->nrexceptional--;
			
 
				-	}
			
 
				-
			
 
				-	if (sector == NO_SECTOR) {
			
 
				-		/*
			
 
				-		 * This can happen during correct operation if our pfn_mkwrite
			
 
				-		 * fault raced against a hole punch operation.  If this
			
 
				-		 * happens the pte that was hole punched will have been
			
 
				-		 * unmapped and the radix tree entry will have been removed by
			
 
				-		 * the time we are called, but the call will still happen.  We
			
 
				-		 * will return all the way up to wp_pfn_shared(), where the
			
 
				-		 * pte_same() check will fail, eventually causing page fault
			
 
				-		 * to be retried by the CPU.
			
 
				-		 */
			
 
				-		goto unlock;
			
 
				+		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
			
 
				+		WARN_ON_ONCE(ret != entry);
			
 
				+		radix_tree_replace_slot(slot, new_entry);
			
 
				 	}
			
 
				-
			
 
				-	error = radix_tree_insert(page_tree, index,
			
 
				-			RADIX_DAX_ENTRY(sector, pmd_entry));
			
 
				-	if (error)
			
 
				-		goto unlock;
			
 
				-
			
 
				-	mapping->nrexceptional++;
			
 
				- dirty:
			
 
				-	if (dirty)
			
 
				+	if (vmf->flags & FAULT_FLAG_WRITE)
			
 
				 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
			
 
				  unlock:
			
 
				 	spin_unlock_irq(&mapping->tree_lock);
			
 
				-	return error;
			
 
				+	if (hole_fill) {
			
 
				+		radix_tree_preload_end();
			
 
				+		/*
			
 
				+		 * We don't need hole page anymore, it has been replaced with
			
 
				+		 * locked radix tree entry now.
			
 
				+		 */
			
 
				+		if (mapping->a_ops->freepage)
			
 
				+			mapping->a_ops->freepage(entry);
			
 
				+		unlock_page(entry);
			
 
				+		put_page(entry);
			
 
				+	}
			
 
				+	return new_entry;
			
 
				 }
			
 
				 
			
 
				 static int dax_writeback_one(struct block_device *bdev,
			
@@ -498,37 +788,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
			
 
				 
			
 
				-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			
 
				+static int dax_insert_mapping(struct address_space *mapping,
			
 
				+			struct buffer_head *bh, void **entryp,
			
 
				 			struct vm_area_struct *vma, struct vm_fault *vmf)
			
 
				 {
			
 
				 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
			
 
				-	struct address_space *mapping = inode->i_mapping;
			
 
				 	struct block_device *bdev = bh->b_bdev;
			
 
				 	struct blk_dax_ctl dax = {
			
 
				-		.sector = to_sector(bh, inode),
			
 
				+		.sector = to_sector(bh, mapping->host),
			
 
				 		.size = bh->b_size,
			
 
				 	};
			
 
				-	int error;
			
 
				+	void *ret;
			
 
				+	void *entry = *entryp;
			
 
				 
			
 
				-	i_mmap_lock_read(mapping);
			
 
				-
			
 
				-	if (dax_map_atomic(bdev, &dax) < 0) {
			
 
				-		error = PTR_ERR(dax.addr);
			
 
				-		goto out;
			
 
				-	}
			
 
				+	if (dax_map_atomic(bdev, &dax) < 0)
			
 
				+		return PTR_ERR(dax.addr);
			
 
				 	dax_unmap_atomic(bdev, &dax);
			
 
				 
			
 
				-	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
			
 
				-			vmf->flags & FAULT_FLAG_WRITE);
			
 
				-	if (error)
			
 
				-		goto out;
			
 
				+	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
			
 
				+	if (IS_ERR(ret))
			
 
				+		return PTR_ERR(ret);
			
 
				+	*entryp = ret;
			
 
				 
			
 
				-	error = vm_insert_mixed(vma, vaddr, dax.pfn);
			
 
				-
			
 
				- out:
			
 
				-	i_mmap_unlock_read(mapping);
			
 
				-
			
 
				-	return error;
			
 
				+	return vm_insert_mixed(vma, vaddr, dax.pfn);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -547,7 +829,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
				 	struct file *file = vma->vm_file;
			
 
				 	struct address_space *mapping = file->f_mapping;
			
 
				 	struct inode *inode = mapping->host;
			
 
				-	struct page *page;
			
 
				+	void *entry;
			
 
				 	struct buffer_head bh;
			
 
				 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
			
 
				 	unsigned blkbits = inode->i_blkbits;
			
@@ -556,6 +838,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
				 	int error;
			
 
				 	int major = 0;
			
 
				 
			
 
				+	/*
			
 
				+	 * Check whether offset isn't beyond end of file now. Caller is supposed
			
 
				+	 * to hold locks serializing us with truncate / punch hole so this is
			
 
				+	 * a reliable test.
			
 
				+	 */
			
 
				 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
			
 
				 	if (vmf->pgoff >= size)
			
 
				 		return VM_FAULT_SIGBUS;
			
@@ -565,27 +852,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
				 	bh.b_bdev = inode->i_sb->s_bdev;
			
 
				 	bh.b_size = PAGE_SIZE;
			
 
				 
			
 
				- repeat:
			
 
				-	page = find_get_page(mapping, vmf->pgoff);
			
 
				-	if (page) {
			
 
				-		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			
 
				-			put_page(page);
			
 
				-			return VM_FAULT_RETRY;
			
 
				-		}
			
 
				-		if (unlikely(page->mapping != mapping)) {
			
 
				-			unlock_page(page);
			
 
				-			put_page(page);
			
 
				-			goto repeat;
			
 
				-		}
			
 
				+	entry = grab_mapping_entry(mapping, vmf->pgoff);
			
 
				+	if (IS_ERR(entry)) {
			
 
				+		error = PTR_ERR(entry);
			
 
				+		goto out;
			
 
				 	}
			
 
				 
			
 
				 	error = get_block(inode, block, &bh, 0);
			
 
				 	if (!error && (bh.b_size < PAGE_SIZE))
			
 
				 		error = -EIO;		/* fs corruption? */
			
 
				 	if (error)
			
 
				-		goto unlock_page;
			
 
				+		goto unlock_entry;
			
 
				+
			
 
				+	if (vmf->cow_page) {
			
 
				+		struct page *new_page = vmf->cow_page;
			
 
				+		if (buffer_written(&bh))
			
 
				+			error = copy_user_bh(new_page, inode, &bh, vaddr);
			
 
				+		else
			
 
				+			clear_user_highpage(new_page, vaddr);
			
 
				+		if (error)
			
 
				+			goto unlock_entry;
			
 
				+		if (!radix_tree_exceptional_entry(entry)) {
			
 
				+			vmf->page = entry;
			
 
				+			return VM_FAULT_LOCKED;
			
 
				+		}
			
 
				+		vmf->entry = entry;
			
 
				+		return VM_FAULT_DAX_LOCKED;
			
 
				+	}
			
 
				 
			
 
				-	if (!buffer_mapped(&bh) && !vmf->cow_page) {
			
 
				+	if (!buffer_mapped(&bh)) {
			
 
				 		if (vmf->flags & FAULT_FLAG_WRITE) {
			
 
				 			error = get_block(inode, block, &bh, 1);
			
 
				 			count_vm_event(PGMAJFAULT);
			
@@ -594,43 +889,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
				 			if (!error && (bh.b_size < PAGE_SIZE))
			
 
				 				error = -EIO;
			
 
				 			if (error)
			
 
				-				goto unlock_page;
			
 
				+				goto unlock_entry;
			
 
				 		} else {
			
 
				-			return dax_load_hole(mapping, page, vmf);
			
 
				+			return dax_load_hole(mapping, entry, vmf);
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	if (vmf->cow_page) {
			
 
				-		struct page *new_page = vmf->cow_page;
			
 
				-		if (buffer_written(&bh))
			
 
				-			error = copy_user_bh(new_page, inode, &bh, vaddr);
			
 
				-		else
			
 
				-			clear_user_highpage(new_page, vaddr);
			
 
				-		if (error)
			
 
				-			goto unlock_page;
			
 
				-		vmf->page = page;
			
 
				-		if (!page)
			
 
				-			i_mmap_lock_read(mapping);
			
 
				-		return VM_FAULT_LOCKED;
			
 
				-	}
			
 
				-
			
 
				-	/* Check we didn't race with a read fault installing a new page */
			
 
				-	if (!page && major)
			
 
				-		page = find_lock_page(mapping, vmf->pgoff);
			
 
				-
			
 
				-	if (page) {
			
 
				-		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
			
 
				-							PAGE_SIZE, 0);
			
 
				-		delete_from_page_cache(page);
			
 
				-		unlock_page(page);
			
 
				-		put_page(page);
			
 
				-		page = NULL;
			
 
				-	}
			
 
				-
			
 
				 	/* Filesystem should not return unwritten buffers to us! */
			
 
				 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
			
 
				-	error = dax_insert_mapping(inode, &bh, vma, vmf);
			
 
				-
			
 
				+	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
			
 
				+ unlock_entry:
			
 
				+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
			
 
				  out:
			
 
				 	if (error == -ENOMEM)
			
 
				 		return VM_FAULT_OOM | major;
			
@@ -638,13 +907,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
				 	if ((error < 0) && (error != -EBUSY))
			
 
				 		return VM_FAULT_SIGBUS | major;
			
 
				 	return VM_FAULT_NOPAGE | major;
			
 
				-
			
 
				- unlock_page:
			
 
				-	if (page) {
			
 
				-		unlock_page(page);
			
 
				-		put_page(page);
			
 
				-	}
			
 
				-	goto out;
			
 
				 }
			
 
				 EXPORT_SYMBOL(__dax_fault);
			
 
				 
			
@@ -675,7 +937,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dax_fault);
			
 
				 
			
 
				-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			
 
				+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
			
 
				 /*
			
 
				  * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
			
 
				  * more often than one might expect in the below function.
			
@@ -713,7 +975,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 
				 	struct block_device *bdev;
			
 
				 	pgoff_t size, pgoff;
			
 
				 	sector_t block;
			
 
				-	int error, result = 0;
			
 
				+	int result = 0;
			
 
				 	bool alloc = false;
			
 
				 
			
 
				 	/* dax pmd mappings require pfn_t_devmap() */
			
@@ -786,9 +1048,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 
				 		truncate_pagecache_range(inode, lstart, lend);
			
 
				 	}
			
 
				 
			
 
				-	i_mmap_lock_read(mapping);
			
 
				-
			
 
				-	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
			
 
				+	if (!write && !buffer_mapped(&bh)) {
			
 
				 		spinlock_t *ptl;
			
 
				 		pmd_t entry;
			
 
				 		struct page *zero_page = get_huge_zero_page();
			
@@ -860,13 +1120,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 
				 		 * the write to insert a dirty entry.
			
 
				 		 */
			
 
				 		if (write) {
			
 
				-			error = dax_radix_entry(mapping, pgoff, dax.sector,
			
 
				-					true, true);
			
 
				-			if (error) {
			
 
				-				dax_pmd_dbg(&bh, address,
			
 
				-						"PMD radix insertion failed");
			
 
				-				goto fallback;
			
 
				-			}
			
 
				+			/*
			
 
				+			 * We should insert radix-tree entry and dirty it here.
			
 
				+			 * For now this is broken...
			
 
				+			 */
			
 
				 		}
			
 
				 
			
 
				 		dev_dbg(part_to_dev(bdev->bd_part),
			
@@ -879,8 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 
				 	}
			
 
				 
			
 
				  out:
			
 
				-	i_mmap_unlock_read(mapping);
			
 
				-
			
 
				 	return result;
			
 
				 
			
 
				  fallback:
			
@@ -926,23 +1181,18 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
 
				 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
			
 
				 {
			
 
				 	struct file *file = vma->vm_file;
			
 
				-	int error;
			
 
				-
			
 
				-	/*
			
 
				-	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
			
 
				-	 * RADIX_DAX_PTE entry already exists in the radix tree from a
			
 
				-	 * previous call to __dax_fault().  We just want to look up that PTE
			
 
				-	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
			
 
				-	 * saves us from having to make a call to get_block() here to look
			
 
				-	 * up the sector.
			
 
				-	 */
			
 
				-	error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
			
 
				-			true);
			
 
				+	struct address_space *mapping = file->f_mapping;
			
 
				+	void *entry;
			
 
				+	pgoff_t index = vmf->pgoff;
			
 
				 
			
 
				-	if (error == -ENOMEM)
			
 
				-		return VM_FAULT_OOM;
			
 
				-	if (error)
			
 
				-		return VM_FAULT_SIGBUS;
			
 
				+	spin_lock_irq(&mapping->tree_lock);
			
 
				+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
			
 
				+	if (!entry || !radix_tree_exceptional_entry(entry))
			
 
				+		goto out;
			
 
				+	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
			
 
				+	put_unlocked_mapping_entry(mapping, index, entry);
			
 
				+out:
			
 
				+	spin_unlock_irq(&mapping->tree_lock);
			
 
				 	return VM_FAULT_NOPAGE;
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
			
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -3,17 +3,25 @@
 
				 
			
 
				 #include <linux/fs.h>
			
 
				 #include <linux/mm.h>
			
 
				+#include <linux/radix-tree.h>
			
 
				 #include <asm/pgtable.h>
			
 
				 
			
 
				+/* We use lowest available exceptional entry bit for locking */
			
 
				+#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
			
 
				+
			
 
				 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
			
 
				 		  get_block_t, dio_iodone_t, int flags);
			
 
				 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
			
 
				 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
			
 
				 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
			
 
				 int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
			
 
				+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
			
 
				+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
			
 
				+				   pgoff_t index, bool wake_all);
			
 
				 
			
 
				 #ifdef CONFIG_FS_DAX
			
 
				 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
			
 
				+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
			
 
				 int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
			
 
				 		unsigned int offset, unsigned int length);
			
 
				 #else
			
@@ -22,6 +30,12 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
 
				 {
			
 
				 	return ERR_PTR(-ENXIO);
			
 
				 }
			
 
				+/* Shouldn't ever be called when dax is disabled. */
			
 
				+static inline void dax_unlock_mapping_entry(struct address_space *mapping,
			
 
				+					    pgoff_t index)
			
 
				+{
			
 
				+	BUG();
			
 
				+}
			
 
				 static inline int __dax_zero_page_range(struct block_device *bdev,
			
 
				 		sector_t sector, unsigned int offset, unsigned int length)
			
 
				 {
			
@@ -29,7 +43,7 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			
 
				+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
			
 
				 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
			
 
				 				unsigned int flags, get_block_t);
			
 
				 int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
			
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -303,6 +303,12 @@ struct vm_fault {
 
				 					 * is set (which is also implied by
			
 
				 					 * VM_FAULT_ERROR).
			
 
				 					 */
			
 
				+	void *entry;			/* ->fault handler can alternatively
			
 
				+					 * return locked DAX entry. In that
			
 
				+					 * case handler should return
			
 
				+					 * VM_FAULT_DAX_LOCKED and fill in
			
 
				+					 * entry here.
			
 
				+					 */
			
 
				 	/* for ->map_pages() only */
			
 
				 	pgoff_t max_pgoff;		/* map pages for offset from pgoff till
			
 
				 					 * max_pgoff inclusive */
			
@@ -1076,6 +1082,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
 
				 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
			
 
				 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
			
 
				 #define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
			
 
				+#define VM_FAULT_DAX_LOCKED 0x1000	/* ->fault has locked DAX entry */
			
 
				 
			
 
				 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
			
 
				 
			
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -143,13 +143,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
 
				 			return;
			
 
				 
			
 
				 	/*
			
 
				-	 * Track node that only contains shadow entries.
			
 
				+	 * Track node that only contains shadow entries. DAX mappings contain
			
 
				+	 * no shadow entries and may contain other exceptional entries so skip
			
 
				+	 * those.
			
 
				 	 *
			
 
				 	 * Avoid acquiring the list_lru lock if already tracked.  The
			
 
				 	 * list_empty() test is safe as node->private_list is
			
 
				 	 * protected by mapping->tree_lock.
			
 
				 	 */
			
 
				-	if (!workingset_node_pages(node) &&
			
 
				+	if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
			
 
				 	    list_empty(&node->private_list)) {
			
 
				 		node->private_data = mapping;
			
 
				 		list_lru_add(&workingset_shadow_nodes, &node->private_list);
			
@@ -580,14 +582,24 @@ static int page_cache_tree_insert(struct address_space *mapping,
 
				 		if (!radix_tree_exceptional_entry(p))
			
 
				 			return -EEXIST;
			
 
				 
			
 
				-		if (WARN_ON(dax_mapping(mapping)))
			
 
				-			return -EINVAL;
			
 
				-
			
 
				-		if (shadowp)
			
 
				-			*shadowp = p;
			
 
				 		mapping->nrexceptional--;
			
 
				-		if (node)
			
 
				-			workingset_node_shadows_dec(node);
			
 
				+		if (!dax_mapping(mapping)) {
			
 
				+			if (shadowp)
			
 
				+				*shadowp = p;
			
 
				+			if (node)
			
 
				+				workingset_node_shadows_dec(node);
			
 
				+		} else {
			
 
				+			/* DAX can replace empty locked entry with a hole */
			
 
				+			WARN_ON_ONCE(p !=
			
 
				+				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
			
 
				+					 RADIX_DAX_ENTRY_LOCK));
			
 
				+			/* DAX accounts exceptional entries as normal pages */
			
 
				+			if (node)
			
 
				+				workingset_node_pages_dec(node);
			
 
				+			/* Wakeup waiters for exceptional entry lock */
			
 
				+			dax_wake_mapping_entry_waiter(mapping, page->index,
			
 
				+						      false);
			
 
				+		}
			
 
				 	}
			
 
				 	radix_tree_replace_slot(slot, page);
			
 
				 	mapping->nrpages++;
			
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -63,6 +63,7 @@
 
				 #include <linux/dma-debug.h>
			
 
				 #include <linux/debugfs.h>
			
 
				 #include <linux/userfaultfd_k.h>
			
 
				+#include <linux/dax.h>
			
 
				 
			
 
				 #include <asm/io.h>
			
 
				 #include <asm/mmu_context.h>
			
@@ -2492,8 +2493,6 @@ void unmap_mapping_range(struct address_space *mapping,
 
				 	if (details.last_index < details.first_index)
			
 
				 		details.last_index = ULONG_MAX;
			
 
				 
			
 
				-
			
 
				-	/* DAX uses i_mmap_lock to serialise file truncate vs page fault */
			
 
				 	i_mmap_lock_write(mapping);
			
 
				 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
			
 
				 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
			
@@ -2825,7 +2824,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
				  */
			
 
				 static int __do_fault(struct vm_area_struct *vma, unsigned long address,
			
 
				 			pgoff_t pgoff, unsigned int flags,
			
 
				-			struct page *cow_page, struct page **page)
			
 
				+			struct page *cow_page, struct page **page,
			
 
				+			void **entry)
			
 
				 {
			
 
				 	struct vm_fault vmf;
			
 
				 	int ret;
			
@@ -2840,8 +2840,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 
				 	ret = vma->vm_ops->fault(vma, &vmf);
			
 
				 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
			
 
				 		return ret;
			
 
				-	if (!vmf.page)
			
 
				-		goto out;
			
 
				+	if (ret & VM_FAULT_DAX_LOCKED) {
			
 
				+		*entry = vmf.entry;
			
 
				+		return ret;
			
 
				+	}
			
 
				 
			
 
				 	if (unlikely(PageHWPoison(vmf.page))) {
			
 
				 		if (ret & VM_FAULT_LOCKED)
			
@@ -2855,7 +2857,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 
				 	else
			
 
				 		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
			
 
				 
			
 
				- out:
			
 
				 	*page = vmf.page;
			
 
				 	return ret;
			
 
				 }
			
@@ -3048,7 +3049,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
				 		pte_unmap_unlock(pte, ptl);
			
 
				 	}
			
 
				 
			
 
				-	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
			
 
				+	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
			
 
				 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
			
 
				 		return ret;
			
 
				 
			
@@ -3071,6 +3072,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
				 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
			
 
				 {
			
 
				 	struct page *fault_page, *new_page;
			
 
				+	void *fault_entry;
			
 
				 	struct mem_cgroup *memcg;
			
 
				 	spinlock_t *ptl;
			
 
				 	pte_t *pte;
			
@@ -3088,26 +3090,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
				 		return VM_FAULT_OOM;
			
 
				 	}
			
 
				 
			
 
				-	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
			
 
				+	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page,
			
 
				+			 &fault_entry);
			
 
				 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
			
 
				 		goto uncharge_out;
			
 
				 
			
 
				-	if (fault_page)
			
 
				+	if (!(ret & VM_FAULT_DAX_LOCKED))
			
 
				 		copy_user_highpage(new_page, fault_page, address, vma);
			
 
				 	__SetPageUptodate(new_page);
			
 
				 
			
 
				 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
			
 
				 	if (unlikely(!pte_same(*pte, orig_pte))) {
			
 
				 		pte_unmap_unlock(pte, ptl);
			
 
				-		if (fault_page) {
			
 
				+		if (!(ret & VM_FAULT_DAX_LOCKED)) {
			
 
				 			unlock_page(fault_page);
			
 
				 			put_page(fault_page);
			
 
				 		} else {
			
 
				-			/*
			
 
				-			 * The fault handler has no page to lock, so it holds
			
 
				-			 * i_mmap_lock for read to protect against truncate.
			
 
				-			 */
			
 
				-			i_mmap_unlock_read(vma->vm_file->f_mapping);
			
 
				+			dax_unlock_mapping_entry(vma->vm_file->f_mapping,
			
 
				+						 pgoff);
			
 
				 		}
			
 
				 		goto uncharge_out;
			
 
				 	}
			
@@ -3115,15 +3115,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
				 	mem_cgroup_commit_charge(new_page, memcg, false, false);
			
 
				 	lru_cache_add_active_or_unevictable(new_page, vma);
			
 
				 	pte_unmap_unlock(pte, ptl);
			
 
				-	if (fault_page) {
			
 
				+	if (!(ret & VM_FAULT_DAX_LOCKED)) {
			
 
				 		unlock_page(fault_page);
			
 
				 		put_page(fault_page);
			
 
				 	} else {
			
 
				-		/*
			
 
				-		 * The fault handler has no page to lock, so it holds
			
 
				-		 * i_mmap_lock for read to protect against truncate.
			
 
				-		 */
			
 
				-		i_mmap_unlock_read(vma->vm_file->f_mapping);
			
 
				+		dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
			
 
				 	}
			
 
				 	return ret;
			
 
				 uncharge_out:
			
@@ -3143,7 +3139,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
				 	int dirtied = 0;
			
 
				 	int ret, tmp;
			
 
				 
			
 
				-	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
			
 
				+	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
			
 
				 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
			
 
				 		return ret;
			
 
				 
			
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping,
 
				 	if (shmem_mapping(mapping))
			
 
				 		return;
			
 
				 
			
 
				-	spin_lock_irq(&mapping->tree_lock);
			
 
				-
			
 
				 	if (dax_mapping(mapping)) {
			
 
				-		if (radix_tree_delete_item(&mapping->page_tree, index, entry))
			
 
				-			mapping->nrexceptional--;
			
 
				-	} else {
			
 
				-		/*
			
 
				-		 * Regular page slots are stabilized by the page lock even
			
 
				-		 * without the tree itself locked.  These unlocked entries
			
 
				-		 * need verification under the tree lock.
			
 
				-		 */
			
 
				-		if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
			
 
				-					&slot))
			
 
				-			goto unlock;
			
 
				-		if (*slot != entry)
			
 
				-			goto unlock;
			
 
				-		radix_tree_replace_slot(slot, NULL);
			
 
				-		mapping->nrexceptional--;
			
 
				-		if (!node)
			
 
				-			goto unlock;
			
 
				-		workingset_node_shadows_dec(node);
			
 
				-		/*
			
 
				-		 * Don't track node without shadow entries.
			
 
				-		 *
			
 
				-		 * Avoid acquiring the list_lru lock if already untracked.
			
 
				-		 * The list_empty() test is safe as node->private_list is
			
 
				-		 * protected by mapping->tree_lock.
			
 
				-		 */
			
 
				-		if (!workingset_node_shadows(node) &&
			
 
				-		    !list_empty(&node->private_list))
			
 
				-			list_lru_del(&workingset_shadow_nodes,
			
 
				-					&node->private_list);
			
 
				-		__radix_tree_delete_node(&mapping->page_tree, node);
			
 
				+		dax_delete_mapping_entry(mapping, index);
			
 
				+		return;
			
 
				 	}
			
 
				+	spin_lock_irq(&mapping->tree_lock);
			
 
				+	/*
			
 
				+	 * Regular page slots are stabilized by the page lock even
			
 
				+	 * without the tree itself locked.  These unlocked entries
			
 
				+	 * need verification under the tree lock.
			
 
				+	 */
			
 
				+	if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
			
 
				+				&slot))
			
 
				+		goto unlock;
			
 
				+	if (*slot != entry)
			
 
				+		goto unlock;
			
 
				+	radix_tree_replace_slot(slot, NULL);
			
 
				+	mapping->nrexceptional--;
			
 
				+	if (!node)
			
 
				+		goto unlock;
			
 
				+	workingset_node_shadows_dec(node);
			
 
				+	/*
			
 
				+	 * Don't track node without shadow entries.
			
 
				+	 *
			
 
				+	 * Avoid acquiring the list_lru lock if already untracked.
			
 
				+	 * The list_empty() test is safe as node->private_list is
			
 
				+	 * protected by mapping->tree_lock.
			
 
				+	 */
			
 
				+	if (!workingset_node_shadows(node) &&
			
 
				+	    !list_empty(&node->private_list))
			
 
				+		list_lru_del(&workingset_shadow_nodes,
			
 
				+				&node->private_list);
			
 
				+	__radix_tree_delete_node(&mapping->page_tree, node);
			
 
				 unlock:
			
 
				 	spin_unlock_irq(&mapping->tree_lock);
			
 
				 }