
Merge branch 'akpm' (patches from Andrew)

Merge small final update from Andrew Morton:

 - DAX feature work: add fsync/msync support

 - kfree cleanup, MAINTAINERS update

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  MAINTAINERS: return arch/sh to maintained state, with new maintainers
  tree wide: use kvfree() than conditional kfree()/vfree()
  dax: never rely on bh.b_dev being set by get_block()
  xfs: call dax_pfn_mkwrite() for DAX fsync/msync
  ext4: call dax_pfn_mkwrite() for DAX fsync/msync
  ext2: call dax_pfn_mkwrite() for DAX fsync/msync
  dax: add support for fsync/sync
  mm: add find_get_entries_tag()
  dax: support dirty DAX entries in radix tree
  pmem: add wb_cache_pmem() to the PMEM API
  dax: fix conversion of holes to PMDs
  dax: fix NULL pointer dereference in __dax_dbg()
Linus Torvalds, 9 years ago · commit 20c759ca98
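Most of the changes below apply one mechanical transformation: open-coded size checks that choose between kfree() and vfree() are collapsed into kvfree(), which accepts memory from either allocator. A minimal before/after sketch (illustrative only; the function and parameter names are placeholders, not taken from any of the files below):

#include <linux/mm.h>		/* kvfree() */
#include <linux/slab.h>		/* kfree() */
#include <linux/vmalloc.h>	/* vfree() */

/* Before: every caller had to remember how the buffer was allocated. */
static void example_free_old(void *buf, size_t size)
{
	if (size <= PAGE_SIZE)
		kfree(buf);
	else
		vfree(buf);
}

/* After: kvfree() frees both kmalloc()ed and vmalloc()ed memory. */
static void example_free_new(void *buf)
{
	kvfree(buf);
}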

+ 3 - 1
MAINTAINERS

@@ -10453,9 +10453,11 @@ S:	Maintained
 F:	drivers/net/ethernet/dlink/sundance.c
 
 SUPERH
+M:	Yoshinori Sato <ysato@users.sourceforge.jp>
+M:	Rich Felker <dalias@libc.org>
 L:	linux-sh@vger.kernel.org
 Q:	http://patchwork.kernel.org/project/linux-sh/list/
-S:	Orphan
+S:	Maintained
 F:	Documentation/sh/
 F:	arch/sh/
 F:	drivers/sh/

+ 2 - 9
arch/arm/mm/dma-mapping.c

@@ -1200,10 +1200,7 @@ error:
 	while (i--)
 		if (pages[i])
 			__free_pages(pages[i], 0);
-	if (array_size <= PAGE_SIZE)
-		kfree(pages);
-	else
-		vfree(pages);
+	kvfree(pages);
 	return NULL;
 }
 
@@ -1211,7 +1208,6 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
 			       size_t size, struct dma_attrs *attrs)
 {
 	int count = size >> PAGE_SHIFT;
-	int array_size = count * sizeof(struct page *);
 	int i;
 
 	if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) {
@@ -1222,10 +1218,7 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
 				__free_pages(pages[i], 0);
 	}
 
-	if (array_size <= PAGE_SIZE)
-		kfree(pages);
-	else
-		vfree(pages);
+	kvfree(pages);
 	return 0;
 }
 

+ 6 - 5
arch/x86/include/asm/pmem.h

@@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void)
 }
 
 /**
- * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * arch_wb_cache_pmem - write back a cache range with CLWB
  * @vaddr:	virtual start address
  * @size:	number of bytes to write back
  *
 * Write back a cache range using the CLWB (cache line write back)
 * instruction.  This function requires explicit ordering with an
- * arch_wmb_pmem() call.  This API is internal to the x86 PMEM implementation.
+ * arch_wmb_pmem() call.
 */
-static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
 {
 	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
 	unsigned long clflush_mask = x86_clflush_size - 1;
+	void *vaddr = (void __force *)addr;
 	void *vend = vaddr + size;
 	void *p;
 
@@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
 	len = copy_from_iter_nocache(vaddr, bytes, i);
 
 	if (__iter_needs_pmem_wb(i))
-		__arch_wb_cache_pmem(vaddr, bytes);
+		arch_wb_cache_pmem(addr, bytes);
 
 	return len;
 }
@@ -133,7 +134,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 	void *vaddr = (void __force *)addr;
 
 	memset(vaddr, 0, size);
-	__arch_wb_cache_pmem(vaddr, size);
+	arch_wb_cache_pmem(addr, size);
 }
 
 static inline bool __arch_has_wmb_pmem(void)

+ 2 - 4
drivers/acpi/apei/erst.c

@@ -32,6 +32,7 @@
 #include <linux/hardirq.h>
 #include <linux/pstore.h>
 #include <linux/vmalloc.h>
+#include <linux/mm.h> /* kvfree() */
 #include <acpi/apei.h>
 
 #include "apei-internal.h"
@@ -532,10 +533,7 @@ retry:
 			return -ENOMEM;
 		memcpy(new_entries, entries,
 		       erst_record_id_cache.len * sizeof(entries[0]));
-		if (erst_record_id_cache.size < PAGE_SIZE)
-			kfree(entries);
-		else
-			vfree(entries);
+		kvfree(entries);
 		erst_record_id_cache.entries = entries = new_entries;
 		erst_record_id_cache.size = new_size;
 	}

+ 7 - 19
drivers/block/drbd/drbd_bitmap.c

@@ -364,12 +364,9 @@ static void bm_free_pages(struct page **pages, unsigned long number)
 	}
 }
 
-static void bm_vk_free(void *ptr, int v)
+static inline void bm_vk_free(void *ptr)
 {
-	if (v)
-		vfree(ptr);
-	else
-		kfree(ptr);
+	kvfree(ptr);
 }
 
 /*
@@ -379,7 +376,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 {
 	struct page **old_pages = b->bm_pages;
 	struct page **new_pages, *page;
-	unsigned int i, bytes, vmalloced = 0;
+	unsigned int i, bytes;
 	unsigned long have = b->bm_number_of_pages;
 
 	BUG_ON(have == 0 && old_pages != NULL);
@@ -401,7 +398,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 				PAGE_KERNEL);
 		if (!new_pages)
 			return NULL;
-		vmalloced = 1;
 	}
 
 	if (want >= have) {
@@ -411,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
 			if (!page) {
 				bm_free_pages(new_pages + have, i - have);
-				bm_vk_free(new_pages, vmalloced);
+				bm_vk_free(new_pages);
 				return NULL;
 			}
 			/* we want to know which page it is
@@ -427,11 +423,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 		*/
 	}
 
-	if (vmalloced)
-		b->bm_flags |= BM_P_VMALLOCED;
-	else
-		b->bm_flags &= ~BM_P_VMALLOCED;
-
 	return new_pages;
 }
 
@@ -469,7 +460,7 @@ void drbd_bm_cleanup(struct drbd_device *device)
 	if (!expect(device->bitmap))
 		return;
 	bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
-	bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags));
+	bm_vk_free(device->bitmap->bm_pages);
 	kfree(device->bitmap);
 	device->bitmap = NULL;
 }
@@ -643,7 +634,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 	unsigned long want, have, onpages; /* number of pages */
 	struct page **npages, **opages = NULL;
 	int err = 0, growing;
-	int opages_vmalloced;
 
 	if (!expect(b))
 		return -ENOMEM;
@@ -656,8 +646,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 	if (capacity == b->bm_dev_capacity)
 		goto out;
 
-	opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
-
 	if (capacity == 0) {
 		spin_lock_irq(&b->bm_lock);
 		opages = b->bm_pages;
@@ -671,7 +659,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 		b->bm_dev_capacity = 0;
 		spin_unlock_irq(&b->bm_lock);
 		bm_free_pages(opages, onpages);
-		bm_vk_free(opages, opages_vmalloced);
+		bm_vk_free(opages);
 		goto out;
 	}
 	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
@@ -744,7 +732,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 
 	spin_unlock_irq(&b->bm_lock);
 	if (opages != npages)
-		bm_vk_free(opages, opages_vmalloced);
+		bm_vk_free(opages);
 	if (!growing)
 		b->bm_set = bm_count_bits(b);
 	drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);

+ 0 - 3
drivers/block/drbd/drbd_int.h

@@ -536,9 +536,6 @@ struct drbd_bitmap; /* opaque for drbd_device */
 /* definition of bits in bm_flags to be used in drbd_bm_lock
  * and drbd_bitmap_io and friends. */
 enum bm_flag {
-	/* do we need to kfree, or vfree bm_pages? */
-	BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
-
 	/* currently locked for bulk operation */
 	BM_LOCKED_MASK = 0xf,
 

+ 3 - 12
drivers/char/mspec.c

@@ -93,14 +93,11 @@ struct vma_data {
 	spinlock_t lock;	/* Serialize access to this structure. */
 	int count;		/* Number of pages allocated. */
 	enum mspec_page_type type; /* Type of pages allocated. */
-	int flags;		/* See VMD_xxx below. */
 	unsigned long vm_start;	/* Original (unsplit) base. */
 	unsigned long vm_end;	/* Original (unsplit) end. */
 	unsigned long maddr[0];	/* Array of MSPEC addresses. */
 };
 
-#define VMD_VMALLOCED 0x1	/* vmalloc'd rather than kmalloc'd */
-
 /* used on shub2 to clear FOP cache in the HUB */
 static unsigned long scratch_page[MAX_NUMNODES];
 #define SH2_AMO_CACHE_ENTRIES	4
@@ -185,10 +182,7 @@ mspec_close(struct vm_area_struct *vma)
 			       "failed to zero page %ld\n", my_page);
 	}
 
-	if (vdata->flags & VMD_VMALLOCED)
-		vfree(vdata);
-	else
-		kfree(vdata);
+	kvfree(vdata);
 }
 
 /*
@@ -256,7 +250,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
 					enum mspec_page_type type)
 {
 	struct vma_data *vdata;
-	int pages, vdata_size, flags = 0;
+	int pages, vdata_size;
 
 	if (vma->vm_pgoff != 0)
 		return -EINVAL;
@@ -271,16 +265,13 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
 	vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
 	if (vdata_size <= PAGE_SIZE)
 		vdata = kzalloc(vdata_size, GFP_KERNEL);
-	else {
+	else
 		vdata = vzalloc(vdata_size);
-		flags = VMD_VMALLOCED;
-	}
 	if (!vdata)
 		return -ENOMEM;
 
 	vdata->vm_start = vma->vm_start;
 	vdata->vm_end = vma->vm_end;
-	vdata->flags = flags;
 	vdata->type = type;
 	spin_lock_init(&vdata->lock);
 	atomic_set(&vdata->refcnt, 1);

+ 1 - 4
drivers/gpu/drm/drm_hashtab.c

@@ -198,10 +198,7 @@ EXPORT_SYMBOL(drm_ht_remove_item);
 void drm_ht_remove(struct drm_open_hash *ht)
 {
 	if (ht->table) {
-		if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order)
-			kfree(ht->table);
-		else
-			vfree(ht->table);
+		kvfree(ht->table);
 		ht->table = NULL;
 	}
 }

+ 2 - 6
drivers/staging/lustre/include/linux/libcfs/libcfs_private.h

@@ -151,16 +151,12 @@ do {									    \
 
 #define LIBCFS_FREE(ptr, size)					  \
 do {								    \
-	int s = (size);						 \
 	if (unlikely((ptr) == NULL)) {				  \
 		CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at "    \
-		       "%s:%d\n", s, __FILE__, __LINE__);	       \
+		       "%s:%d\n", (int)(size), __FILE__, __LINE__);	\
 		break;						  \
 	}							       \
-	if (unlikely(s > LIBCFS_VMALLOC_SIZE))			  \
-		vfree(ptr);				    \
-	else							    \
-		kfree(ptr);					  \
+	kvfree(ptr);					  \
 } while (0)
 
 /******************************************************************************/

+ 1 - 1
fs/block_dev.c

@@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-	if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
 		return;
 
 	invalidate_bh_lrus();

+ 1 - 2
fs/coda/coda_linux.h

@@ -72,8 +72,7 @@ void coda_sysctl_clean(void);
 } while (0)
 
 
-#define CODA_FREE(ptr,size) \
-    do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+#define CODA_FREE(ptr, size) kvfree((ptr))
 
 /* inode to cnode access functions */
 

+ 260 - 14
fs/dax.c

@@ -24,6 +24,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/pagevec.h>
 #include <linux/pmem.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
@@ -245,6 +246,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 	loff_t end = pos + iov_iter_count(iter);
 
 	memset(&bh, 0, sizeof(bh));
+	bh.b_bdev = inode->i_sb->s_bdev;
 
 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
 		struct address_space *mapping = inode->i_mapping;
@@ -324,6 +326,199 @@ static int copy_user_bh(struct page *to, struct inode *inode,
 	return 0;
 }
 
+#define NO_SECTOR -1
+#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
+
+static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
+		sector_t sector, bool pmd_entry, bool dirty)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	pgoff_t pmd_index = DAX_PMD_INDEX(index);
+	int type, error = 0;
+	void *entry;
+
+	WARN_ON_ONCE(pmd_entry && !dirty);
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	entry = radix_tree_lookup(page_tree, pmd_index);
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+		index = pmd_index;
+		goto dirty;
+	}
+
+	entry = radix_tree_lookup(page_tree, index);
+	if (entry) {
+		type = RADIX_DAX_TYPE(entry);
+		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
+					type != RADIX_DAX_PMD)) {
+			error = -EIO;
+			goto unlock;
+		}
+
+		if (!pmd_entry || type == RADIX_DAX_PMD)
+			goto dirty;
+
+		/*
+		 * We only insert dirty PMD entries into the radix tree.  This
+		 * means we don't need to worry about removing a dirty PTE
+		 * entry and inserting a clean PMD entry, thus reducing the
+		 * range we would flush with a follow-up fsync/msync call.
+		 */
+		radix_tree_delete(&mapping->page_tree, index);
+		mapping->nrexceptional--;
+	}
+
+	if (sector == NO_SECTOR) {
+		/*
+		 * This can happen during correct operation if our pfn_mkwrite
+		 * fault raced against a hole punch operation.  If this
+		 * happens the pte that was hole punched will have been
+		 * unmapped and the radix tree entry will have been removed by
+		 * the time we are called, but the call will still happen.  We
+		 * will return all the way up to wp_pfn_shared(), where the
+		 * pte_same() check will fail, eventually causing page fault
+		 * to be retried by the CPU.
+		 */
+		goto unlock;
+	}
+
+	error = radix_tree_insert(page_tree, index,
+			RADIX_DAX_ENTRY(sector, pmd_entry));
+	if (error)
+		goto unlock;
+
+	mapping->nrexceptional++;
+ dirty:
+	if (dirty)
+		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return error;
+}
+
+static int dax_writeback_one(struct block_device *bdev,
+		struct address_space *mapping, pgoff_t index, void *entry)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	int type = RADIX_DAX_TYPE(entry);
+	struct radix_tree_node *node;
+	struct blk_dax_ctl dax;
+	void **slot;
+	int ret = 0;
+
+	spin_lock_irq(&mapping->tree_lock);
+	/*
+	 * Regular page slots are stabilized by the page lock even
+	 * without the tree itself locked.  These unlocked entries
+	 * need verification under the tree lock.
+	 */
+	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+
+	/* another fsync thread may have already written back this entry */
+	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto unlock;
+
+	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+		ret = -EIO;
+		goto unlock;
+	}
+
+	dax.sector = RADIX_DAX_SECTOR(entry);
+	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+	spin_unlock_irq(&mapping->tree_lock);
+
+	/*
+	 * We cannot hold tree_lock while calling dax_map_atomic() because it
+	 * eventually calls cond_resched().
+	 */
+	ret = dax_map_atomic(bdev, &dax);
+	if (ret < 0)
+		return ret;
+
+	if (WARN_ON_ONCE(ret < dax.size)) {
+		ret = -EIO;
+		goto unmap;
+	}
+
+	wb_cache_pmem(dax.addr, dax.size);
+
+	spin_lock_irq(&mapping->tree_lock);
+	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+	spin_unlock_irq(&mapping->tree_lock);
+ unmap:
+	dax_unmap_atomic(bdev, &dax);
+	return ret;
+
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
+
+/*
+ * Flush the mapping to the persistent domain within the byte range of [start,
+ * end]. This is required by data integrity operations to ensure file data is
+ * on persistent storage prior to completion of the operation.
+ */
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+		loff_t end)
+{
+	struct inode *inode = mapping->host;
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t indices[PAGEVEC_SIZE];
+	struct pagevec pvec;
+	bool done = false;
+	int i, ret = 0;
+	void *entry;
+
+	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
+		return -EIO;
+
+	start_index = start >> PAGE_CACHE_SHIFT;
+	end_index = end >> PAGE_CACHE_SHIFT;
+	pmd_index = DAX_PMD_INDEX(start_index);
+
+	rcu_read_lock();
+	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
+	rcu_read_unlock();
+
+	/* see if the start of our range is covered by a PMD entry */
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+		start_index = pmd_index;
+
+	tag_pages_for_writeback(mapping, start_index, end_index);
+
+	pagevec_init(&pvec, 0);
+	while (!done) {
+		pvec.nr = find_get_entries_tag(mapping, start_index,
+				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+				pvec.pages, indices);
+
+		if (pvec.nr == 0)
+			break;
+
+		for (i = 0; i < pvec.nr; i++) {
+			if (indices[i] > end_index) {
+				done = true;
+				break;
+			}
+
+			ret = dax_writeback_one(bdev, mapping, indices[i],
+					pvec.pages[i]);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	wmb_pmem();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
@@ -363,6 +558,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	}
 	dax_unmap_atomic(bdev, &dax);
 
+	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
+			vmf->flags & FAULT_FLAG_WRITE);
+	if (error)
+		goto out;
+
 	error = vm_insert_mixed(vma, vaddr, dax.pfn);
 
  out:
@@ -408,6 +608,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	memset(&bh, 0, sizeof(bh));
 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_SIZE;
 
  repeat:
@@ -487,6 +688,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		delete_from_page_cache(page);
 		unlock_page(page);
 		page_cache_release(page);
+		page = NULL;
 	}
 
 	/*
@@ -590,7 +792,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	struct block_device *bdev;
 	pgoff_t size, pgoff;
 	sector_t block;
-	int result = 0;
+	int error, result = 0;
+	bool alloc = false;
 
 	/* dax pmd mappings require pfn_t_devmap() */
 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@ -624,13 +827,21 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 
 	memset(&bh, 0, sizeof(bh));
+	bh.b_bdev = inode->i_sb->s_bdev;
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	if (get_block(inode, block, &bh, write) != 0)
+
+	if (get_block(inode, block, &bh, 0) != 0)
 		return VM_FAULT_SIGBUS;
+
+	if (!buffer_mapped(&bh) && write) {
+		if (get_block(inode, block, &bh, 1) != 0)
+			return VM_FAULT_SIGBUS;
+		alloc = true;
+	}
+
 	bdev = bh.b_bdev;
-	i_mmap_lock_read(mapping);
 
 	/*
 	 * If the filesystem isn't willing to tell us the length of a hole,
@@ -639,19 +850,22 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 */
 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
 		dax_pmd_dbg(&bh, address, "allocated block too small");
-		goto fallback;
+		return VM_FAULT_FALLBACK;
 	}
 
 	/*
 	 * If we allocated new storage, make sure no process has any
 	 * zero pages covering this hole
 	 */
-	if (buffer_new(&bh)) {
-		i_mmap_unlock_read(mapping);
-		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-		i_mmap_lock_read(mapping);
+	if (alloc) {
+		loff_t lstart = pgoff << PAGE_SHIFT;
+		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+
+		truncate_pagecache_range(inode, lstart, lend);
 	}
 
+	i_mmap_lock_read(mapping);
+
 	/*
 	 * If a truncate happened while we were allocating blocks, we may
 	 * leave blocks allocated to the file that are beyond EOF.  We can't
@@ -664,7 +878,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		goto out;
 	}
 	if ((pgoff | PG_PMD_COLOUR) >= size) {
-		dax_pmd_dbg(&bh, address, "pgoff unaligned");
+		dax_pmd_dbg(&bh, address,
+				"offset + huge page size > file size");
 		goto fallback;
 	}
 
@@ -732,6 +947,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		}
 		dax_unmap_atomic(bdev, &dax);
 
+		/*
+		 * For PTE faults we insert a radix tree entry for reads, and
+		 * leave it clean.  Then on the first write we dirty the radix
+		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
+		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+		 * call into get_block() to translate the pgoff to a sector in
+		 * order to be able to create a new radix tree entry.
+		 *
+		 * The PMD path doesn't have an equivalent to
+		 * dax_pfn_mkwrite(), though, so for a read followed by a
+		 * write we traverse all the way through __dax_pmd_fault()
+		 * twice.  This means we can just skip inserting a radix tree
+		 * entry completely on the initial read and just wait until
+		 * the write to insert a dirty entry.
+		 */
+		if (write) {
+			error = dax_radix_entry(mapping, pgoff, dax.sector,
+					true, true);
+			if (error) {
+				dax_pmd_dbg(&bh, address,
+						"PMD radix insertion failed");
+				goto fallback;
+			}
+		}
+
 		dev_dbg(part_to_dev(bdev->bd_part),
 				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
 				__func__, current->comm, address,
@@ -790,15 +1030,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
- *
  */
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+	struct file *file = vma->vm_file;
 
-	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
-	sb_end_pagefault(sb);
+	/*
+	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+	 * RADIX_DAX_PTE entry already exists in the radix tree from a
+	 * previous call to __dax_fault().  We just want to look up that PTE
+	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
+	 * saves us from having to make a call to get_block() here to look
+	 * up the sector.
+	 */
+	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -835,6 +1080,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	BUG_ON((offset + length) > PAGE_CACHE_SIZE);
 
 	memset(&bh, 0, sizeof(bh));
+	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_CACHE_SIZE;
 	err = get_block(inode, index, &bh, 0);
 	if (err < 0)
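The application-visible contract that the fs/dax.c changes above make reliable is the ordinary mmap()/msync() (or fsync()) sequence: dirty DAX radix tree entries recorded at fault time are written back through dax_writeback_mapping_range() when the file is synced. A minimal userspace sketch of that sequence (standalone, standard POSIX calls only; the mount point /mnt/dax and file name are illustrative):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/dax/data", O_RDWR | O_CREAT, 0644);
	if (fd < 0)
		return 1;
	if (ftruncate(fd, 4096) < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	memcpy(p, "persist me", 11);

	/* With this series, msync()/fsync() on a DAX mapping also flush the
	 * CPU cache lines backing the stores above before returning. */
	if (msync(p, 4096, MS_SYNC) < 0)
		return 1;

	munmap(p, 4096);
	close(fd);
	return 0;
}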

+ 3 - 1
fs/ext2/file.c

@@ -102,8 +102,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 {
 	struct inode *inode = file_inode(vma->vm_file);
 	struct ext2_inode_info *ei = EXT2_I(inode);
-	int ret = VM_FAULT_NOPAGE;
 	loff_t size;
+	int ret;
 
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
@@ -113,6 +113,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
+	else
+		ret = dax_pfn_mkwrite(vma, vmf);
 
 	up_read(&ei->dax_sem);
 	sb_end_pagefault(inode->i_sb);

+ 3 - 1
fs/ext4/file.c

@@ -291,8 +291,8 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
 {
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
-	int ret = VM_FAULT_NOPAGE;
 	loff_t size;
+	int ret;
 
 	sb_start_pagefault(sb);
 	file_update_time(vma->vm_file);
@@ -300,6 +300,8 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
+	else
+		ret = dax_pfn_mkwrite(vma, vmf);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(sb);
 

+ 1 - 1
fs/inode.c

@@ -495,7 +495,7 @@ void clear_inode(struct inode *inode)
 	 */
 	spin_lock_irq(&inode->i_data.tree_lock);
 	BUG_ON(inode->i_data.nrpages);
-	BUG_ON(inode->i_data.nrshadows);
+	BUG_ON(inode->i_data.nrexceptional);
 	spin_unlock_irq(&inode->i_data.tree_lock);
 	BUG_ON(!list_empty(&inode->i_data.private_list));
 	BUG_ON(!(inode->i_state & I_FREEING));

+ 2 - 6
fs/jffs2/build.c

@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mtd/mtd.h>
+#include <linux/mm.h> /* kvfree() */
 #include "nodelist.h"
 
 static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
@@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
 	return 0;
 
  out_free:
-#ifndef __ECOS
-	if (jffs2_blocks_use_vmalloc(c))
-		vfree(c->blocks);
-	else
-#endif
-		kfree(c->blocks);
+	kvfree(c->blocks);
 
 	return ret;
 }

+ 1 - 4
fs/jffs2/fs.c

@@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 out_root:
 	jffs2_free_ino_caches(c);
 	jffs2_free_raw_node_refs(c);
-	if (jffs2_blocks_use_vmalloc(c))
-		vfree(c->blocks);
-	else
-		kfree(c->blocks);
+	kvfree(c->blocks);
  out_inohash:
 	jffs2_clear_xattr_subsystem(c);
 	kfree(c->inocache_list);

+ 1 - 4
fs/jffs2/super.c

@@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb)
 
 	jffs2_free_ino_caches(c);
 	jffs2_free_raw_node_refs(c);
-	if (jffs2_blocks_use_vmalloc(c))
-		vfree(c->blocks);
-	else
-		kfree(c->blocks);
+	kvfree(c->blocks);
 	jffs2_flash_cleanup(c);
 	kfree(c->inocache_list);
 	jffs2_clear_xattr_subsystem(c);

+ 1 - 6
fs/udf/super.c

@@ -279,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
 {
 	int i;
 	int nr_groups = bitmap->s_nr_groups;
-	int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) *
-						nr_groups);
 
 	for (i = 0; i < nr_groups; i++)
 		if (bitmap->s_block_bitmap[i])
 			brelse(bitmap->s_block_bitmap[i]);
 
-	if (size <= PAGE_SIZE)
-		kfree(bitmap);
-	else
-		vfree(bitmap);
+	kvfree(bitmap);
 }
 
 static void udf_free_partition(struct udf_part_map *map)

+ 4 - 3
fs/xfs/xfs_file.c

@@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault(
 /*
 * pfn_mkwrite was originally inteneded to ensure we capture time stamp
 * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
- * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
- * barrier in place.
+ * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
+ * to ensure we serialise the fault barrier in place.
 */
 static int
 xfs_filemap_pfn_mkwrite(
@@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite(
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
+	else if (IS_DAX(inode))
+		ret = dax_pfn_mkwrite(vma, vmf);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 	return ret;

+ 7 - 0
include/linux/dax.h

@@ -36,4 +36,11 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
 	return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
 }
+
+static inline bool dax_mapping(struct address_space *mapping)
+{
+	return mapping->host && IS_DAX(mapping->host);
+}
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+		loff_t end);
 #endif

+ 2 - 1
include/linux/fs.h

@@ -433,7 +433,8 @@ struct address_space {
 	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
-	unsigned long		nrshadows;	/* number of shadow entries */
+	/* number of shadow or DAX exceptional entries */
+	unsigned long		nrexceptional;
 	pgoff_t			writeback_index;/* writeback starts here */
 	const struct address_space_operations *a_ops;	/* methods */
 	unsigned long		flags;		/* error bits/gfp mask */

+ 3 - 0
include/linux/pagemap.h

@@ -361,6 +361,9 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 			       unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
+unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+			int tag, unsigned int nr_entries,
+			struct page **entries, pgoff_t *indices);
 
 struct page *grab_cache_page_write_begin(struct address_space *mapping,
 			pgoff_t index, unsigned flags);

+ 21 - 1
include/linux/pmem.h

@@ -53,12 +53,18 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 {
 	BUG();
 }
+
+static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
+{
+	BUG();
+}
 #endif
 
 /*
 * Architectures that define ARCH_HAS_PMEM_API must provide
 * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(),
- * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem().
+ * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem()
+ * and arch_has_wmb_pmem().
 */
 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
 {
@@ -178,4 +184,18 @@ static inline void clear_pmem(void __pmem *addr, size_t size)
 	else
 		default_clear_pmem(addr, size);
 }
+
+/**
+ * wb_cache_pmem - write back processor cache for PMEM memory range
+ * @addr:	virtual start address
+ * @size:	number of bytes to write back
+ *
+ * Write back the processor cache range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with a wmb_pmem() call.
+ */
+static inline void wb_cache_pmem(void __pmem *addr, size_t size)
+{
+	if (arch_has_pmem_api())
+		arch_wb_cache_pmem(addr, size);
+}
 #endif /* __PMEM_H__ */
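As the kernel-doc above says, wb_cache_pmem() only writes back the covering cache lines; callers still need a wmb_pmem() to order those write-backs, which is what dax_writeback_mapping_range() in fs/dax.c does after its loop. A minimal sketch of that calling pattern (the helper name and parameters are hypothetical, not part of the patch):

#include <linux/pmem.h>

/* Hypothetical helper: durably flush one already-mapped PMEM range. */
static void flush_range_to_pmem(void __pmem *addr, size_t size)
{
	wb_cache_pmem(addr, size);	/* write back the covering cache lines */
	wmb_pmem();			/* then order them to the persistent domain */
}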

+ 9 - 0
include/linux/radix-tree.h

@@ -51,6 +51,15 @@
 #define RADIX_TREE_EXCEPTIONAL_ENTRY	2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT	2
 
+#define RADIX_DAX_MASK	0xf
+#define RADIX_DAX_SHIFT	4
+#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
 	return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
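The RADIX_DAX_* macros added above pack a block device sector and an entry type into a single exceptional radix tree entry: the low four bits carry the type (with RADIX_TREE_EXCEPTIONAL_ENTRY set) and the sector sits above them. A standalone round-trip sketch, with the macro definitions copied from the hunk above and an arbitrary example sector:

#include <stdio.h>

#define RADIX_TREE_EXCEPTIONAL_ENTRY	2
#define RADIX_DAX_MASK	0xf
#define RADIX_DAX_SHIFT	4
#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))

int main(void)
{
	unsigned long sector = 123456;			/* arbitrary example */
	void *entry = RADIX_DAX_ENTRY(sector, 0);	/* PTE-sized entry */

	/* Decoding recovers the type (RADIX_DAX_PTE here) and the sector. */
	printf("type=0x%lx sector=%lu\n",
	       RADIX_DAX_TYPE(entry), RADIX_DAX_SECTOR(entry));
	return 0;
}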

+ 1 - 1
ipc/sem.c

@@ -1493,7 +1493,7 @@ out_rcu_wakeup:
 	wake_up_sem_queue_do(&tasks);
 out_free:
 	if (sem_io != fast_sem_io)
-		ipc_free(sem_io, sizeof(ushort)*nsems);
+		ipc_free(sem_io);
 	return err;
 }
 

+ 3 - 8
ipc/util.c

@@ -414,17 +414,12 @@ void *ipc_alloc(int size)
 /**
 * ipc_free - free ipc space
 * @ptr: pointer returned by ipc_alloc
- * @size: size of block
 *
- * Free a block created with ipc_alloc(). The caller must know the size
- * used in the allocation call.
+ * Free a block created with ipc_alloc().
 */
-void ipc_free(void *ptr, int size)
+void ipc_free(void *ptr)
 {
-	if (size > PAGE_SIZE)
-		vfree(ptr);
-	else
-		kfree(ptr);
+	kvfree(ptr);
 }
 
 /**

+ 1 - 1
ipc/util.h

@@ -118,7 +118,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
 * both function can sleep
 */
 void *ipc_alloc(int size);
-void ipc_free(void *ptr, int size);
+void ipc_free(void *ptr);
 
 /*
 * For allocation that need to be freed by RCU.

+ 85 - 6
mm/filemap.c

@@ -11,6 +11,7 @@
 */
 #include <linux/export.h>
 #include <linux/compiler.h>
+#include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/capability.h>
@@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
 
 	if (shadow) {
-		mapping->nrshadows++;
+		mapping->nrexceptional++;
 		/*
-		 * Make sure the nrshadows update is committed before
+		 * Make sure the nrexceptional update is committed before
 		 * the nrpages update so that final truncate racing
 		 * with reclaim does not see both counters 0 at the
 		 * same time and miss a shadow entry.
@@ -481,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 {
 	int err = 0;
 
+	if (dax_mapping(mapping) && mapping->nrexceptional) {
+		err = dax_writeback_mapping_range(mapping, lstart, lend);
+		if (err)
+			return err;
+	}
+
 	if (mapping->nrpages) {
 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
 						 WB_SYNC_ALL);
@@ -579,9 +586,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
 		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
+
+		if (WARN_ON(dax_mapping(mapping)))
+			return -EINVAL;
+
 		if (shadowp)
 			*shadowp = p;
-		mapping->nrshadows--;
+		mapping->nrexceptional--;
 		if (node)
 			workingset_node_shadows_dec(node);
 	}
@@ -1245,9 +1256,9 @@ repeat:
 			if (radix_tree_deref_retry(page))
 				goto restart;
 			/*
-			 * A shadow entry of a recently evicted page,
-			 * or a swap entry from shmem/tmpfs.  Return
-			 * it without attempting to raise page count.
+			 * A shadow entry of a recently evicted page, a swap
+			 * entry from shmem/tmpfs or a DAX entry.  Return it
+			 * without attempting to raise page count.
 			 */
 			goto export;
 		}
@@ -1494,6 +1505,74 @@ repeat:
 }
 EXPORT_SYMBOL(find_get_pages_tag);
 
+/**
+ * find_get_entries_tag - find and return entries that match @tag
+ * @mapping:	the address_space to search
+ * @start:	the starting page cache index
+ * @tag:	the tag index
+ * @nr_entries:	the maximum number of entries
+ * @entries:	where the resulting entries are placed
+ * @indices:	the cache indices corresponding to the entries in @entries
+ *
+ * Like find_get_entries, except we only return entries which are tagged with
+ * @tag.
+ */
+unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+			int tag, unsigned int nr_entries,
+			struct page **entries, pgoff_t *indices)
+{
+	void **slot;
+	unsigned int ret = 0;
+	struct radix_tree_iter iter;
+
+	if (!nr_entries)
+		return 0;
+
+	rcu_read_lock();
+restart:
+	radix_tree_for_each_tagged(slot, &mapping->page_tree,
+				   &iter, start, tag) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot(slot);
+		if (unlikely(!page))
+			continue;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
+
+			/*
+			 * A shadow entry of a recently evicted page, a swap
+			 * entry from shmem/tmpfs or a DAX entry.  Return it
+			 * without attempting to raise page count.
+			 */
+			goto export;
+		}
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *slot)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+export:
+		indices[ret] = iter.index;
+		entries[ret] = page;
+		if (++ret == nr_entries)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(find_get_entries_tag);
+
 /*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:

+ 7 - 11
mm/percpu.c

@@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size)
 /**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
- * @size: size of the area
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
-static void pcpu_mem_free(void *ptr, size_t size)
+static void pcpu_mem_free(void *ptr)
 {
-	if (size <= PAGE_SIZE)
-		kfree(ptr);
-	else
-		vfree(ptr);
+	kvfree(ptr);
 }
 
 /**
@@ -463,8 +459,8 @@ out_unlock:
 	 * pcpu_mem_free() might end up calling vfree() which uses
 	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
 	 */
-	pcpu_mem_free(old, old_size);
-	pcpu_mem_free(new, new_size);
+	pcpu_mem_free(old);
+	pcpu_mem_free(new);
 
 	return 0;
 }
@@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
 	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
						sizeof(chunk->map[0]));
 	if (!chunk->map) {
-		pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+		pcpu_mem_free(chunk);
 		return NULL;
 	}
 
@@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
 {
 	if (!chunk)
 		return;
-	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
-	pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+	pcpu_mem_free(chunk->map);
+	pcpu_mem_free(chunk);
 }
 
 /**

+ 39 - 30
mm/truncate.c

@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/backing-dev.h>
+#include <linux/dax.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping,
 		return;
 
 	spin_lock_irq(&mapping->tree_lock);
-	/*
-	 * Regular page slots are stabilized by the page lock even
-	 * without the tree itself locked.  These unlocked entries
-	 * need verification under the tree lock.
-	 */
-	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
-		goto unlock;
-	if (*slot != entry)
-		goto unlock;
-	radix_tree_replace_slot(slot, NULL);
-	mapping->nrshadows--;
-	if (!node)
-		goto unlock;
-	workingset_node_shadows_dec(node);
-	/*
-	 * Don't track node without shadow entries.
-	 *
-	 * Avoid acquiring the list_lru lock if already untracked.
-	 * The list_empty() test is safe as node->private_list is
-	 * protected by mapping->tree_lock.
-	 */
-	if (!workingset_node_shadows(node) &&
-	    !list_empty(&node->private_list))
-		list_lru_del(&workingset_shadow_nodes, &node->private_list);
-	__radix_tree_delete_node(&mapping->page_tree, node);
+
+	if (dax_mapping(mapping)) {
+		if (radix_tree_delete_item(&mapping->page_tree, index, entry))
+			mapping->nrexceptional--;
+	} else {
+		/*
+		 * Regular page slots are stabilized by the page lock even
+		 * without the tree itself locked.  These unlocked entries
+		 * need verification under the tree lock.
+		 */
+		if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+					&slot))
+			goto unlock;
+		if (*slot != entry)
+			goto unlock;
+		radix_tree_replace_slot(slot, NULL);
+		mapping->nrexceptional--;
+		if (!node)
+			goto unlock;
+		workingset_node_shadows_dec(node);
+		/*
+		 * Don't track node without shadow entries.
+		 *
+		 * Avoid acquiring the list_lru lock if already untracked.
+		 * The list_empty() test is safe as node->private_list is
+		 * protected by mapping->tree_lock.
+		 */
+		if (!workingset_node_shadows(node) &&
+		    !list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+					&node->private_list);
+		__radix_tree_delete_node(&mapping->page_tree, node);
+	}
 unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
@@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	int		i;
 
 	cleancache_invalidate_inode(mapping);
-	if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
 		return;
 
 	/* Offsets within partial pages */
@@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
 */
 void truncate_inode_pages_final(struct address_space *mapping)
 {
-	unsigned long nrshadows;
+	unsigned long nrexceptional;
 	unsigned long nrpages;
 
 	/*
@@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
 
 	/*
 	 * When reclaim installs eviction entries, it increases
-	 * nrshadows first, then decreases nrpages.  Make sure we see
+	 * nrexceptional first, then decreases nrpages.  Make sure we see
 	 * this in the right order or we might miss an entry.
 	 */
 	nrpages = mapping->nrpages;
 	smp_rmb();
-	nrshadows = mapping->nrshadows;
+	nrexceptional = mapping->nrexceptional;
 
-	if (nrpages || nrshadows) {
+	if (nrpages || nrexceptional) {
 		/*
 		 * As truncation uses a lockless tree lookup, cycle
 		 * the tree lock to make sure any ongoing tree

+ 8 - 1
mm/vmscan.c

@@ -46,6 +46,7 @@
 #include <linux/oom.h>
 #include <linux/prefetch.h>
 #include <linux/printk.h>
+#include <linux/dax.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		 * inode reclaim needs to empty out the radix tree or
 		 * the nodes are lost.  Don't plant shadows behind its
 		 * back.
+		 *
+		 * We also don't store shadows for DAX mappings because the
+		 * only page cache pages found in these are zero pages
+		 * covering holes, and because we don't want to mix DAX
+		 * exceptional entries and shadow exceptional entries in the
+		 * same page_tree.
 		 */
 		if (reclaimed && page_is_file_cache(page) &&
-		    !mapping_exiting(mapping))
+		    !mapping_exiting(mapping) && !dax_mapping(mapping))
 			shadow = workingset_eviction(mapping, page);
 		__delete_from_page_cache(page, shadow, memcg);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);

+ 2 - 2
mm/workingset.c

@@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 			node->slots[i] = NULL;
 			BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
 			node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
-			BUG_ON(!mapping->nrshadows);
-			mapping->nrshadows--;
+			BUG_ON(!mapping->nrexceptional);
+			mapping->nrexceptional--;
 		}
 	}
 	BUG_ON(node->count);

+ 1 - 3
net/ipv4/fib_trie.c

@@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head)
 
 	if (!n->tn_bits)
 		kmem_cache_free(trie_leaf_kmem, n);
-	else if (n->tn_bits <= TNODE_KMALLOC_MAX)
-		kfree(n);
 	else
-		vfree(n);
+		kvfree(n);
 }
 
 #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)