
Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm fixes from Dan Williams:
 "1/ Fixes to the libnvdimm 'pfn' device that establishes a reserved
     area for storing a struct page array.

  2/ Fixes for dax operations on a raw block device to prevent pagecache
     collisions with dax mappings.

  3/ A fix for pfn_t usage in vm_insert_mixed that led to a null
     pointer dereference.

  These have received build success notification from the kbuild robot
  across 153 configs and pass the latest ndctl tests"

* 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  phys_to_pfn_t: use phys_addr_t
  mm: fix pfn_t to page conversion in vm_insert_mixed
  block: use DAX for partition table reads
  block: revert runtime dax control of the raw block device
  fs, block: force direct-I/O for dax-enabled block devices
  devm_memremap_pages: fix vmem_altmap lifetime + alignment handling
  libnvdimm, pfn: fix restoring memmap location
  libnvdimm: fix mode determination for e820 devices
Linus Torvalds, 9 years ago
commit 29a8ea4fbe

+ 0 - 38
block/ioctl.c

@@ -434,42 +434,6 @@ bool blkdev_dax_capable(struct block_device *bdev)

 	return true;
 }
-
-static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
-{
-	unsigned long arg;
-	int rc = 0;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
-	if (get_user(arg, (int __user *)(argp)))
-		return -EFAULT;
-	arg = !!arg;
-	if (arg == !!(bdev->bd_inode->i_flags & S_DAX))
-		return 0;
-
-	if (arg)
-		arg = S_DAX;
-
-	if (arg && !blkdev_dax_capable(bdev))
-		return -ENOTTY;
-
-	inode_lock(bdev->bd_inode);
-	if (bdev->bd_map_count == 0)
-		inode_set_flags(bdev->bd_inode, arg, S_DAX);
-	else
-		rc = -EBUSY;
-	inode_unlock(bdev->bd_inode);
-	return rc;
-}
-#else
-static int blkdev_daxset(struct block_device *bdev, int arg)
-{
-	if (arg)
-		return -ENOTTY;
-	return 0;
-}
 #endif

 static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
@@ -634,8 +598,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKTRACESETUP:
 	case BLKTRACETEARDOWN:
 		return blk_trace_ioctl(bdev, cmd, argp);
-	case BLKDAXSET:
-		return blkdev_daxset(bdev, arg);
 	case BLKDAXGET:
 		return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
 		break;

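With BLKDAXSET reverted, DAX on a raw block device can no longer be toggled from userspace; only the BLKDAXGET query remains. A minimal userspace sketch of that query, assuming this kernel's headers and a hypothetical /dev/pmem0:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* BLKDAXGET */

int main(void)
{
	int dax = 0;
	int fd = open("/dev/pmem0", O_RDONLY);	/* hypothetical device */

	if (fd < 0)
		return 1;
	/* the kernel writes an int: 1 if S_DAX is set on the bdev inode */
	if (ioctl(fd, BLKDAXGET, &dax) == 0)
		printf("dax enabled: %d\n", dax);
	close(fd);
	return 0;
}
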
+ 15 - 3
block/partition-generic.c

@@ -16,6 +16,7 @@
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
+#include <linux/dax.h>
 #include <linux/blktrace_api.h>

 #include "partitions/check.h"
@@ -550,13 +551,24 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
 	return 0;
 }

-unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
+static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+	return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+			NULL);
+}
+
+unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
+{
 	struct page *page;

-	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
-				 NULL);
+	/* don't populate page cache for dax capable devices */
+	if (IS_DAX(bdev->bd_inode))
+		page = read_dax_sector(bdev, n);
+	else
+		page = read_pagecache_sector(bdev, n);
+
 	if (!IS_ERR(page)) {
 		if (PageError(page))
 			goto fail;

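For reference, the page-cache path converts a 512-byte sector number to a page index with n >> (PAGE_CACHE_SHIFT - 9); with 4 KiB pages that shift is 3, so eight sectors share one page. A standalone sketch of the arithmetic, assuming PAGE_CACHE_SHIFT is 12:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* assumed: 4 KiB pages */

/* same conversion as read_pagecache_sector() above */
static unsigned long sector_to_page_index(unsigned long long n)
{
	return (unsigned long)(n >> (PAGE_CACHE_SHIFT - 9));
}

int main(void)
{
	printf("%lu\n", sector_to_page_index(7));	/* 0 */
	printf("%lu\n", sector_to_page_index(8));	/* 1 */
	printf("%lu\n", sector_to_page_index(100));	/* 12 */
	return 0;
}
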
+ 5 - 3
drivers/nvdimm/namespace_devs.c

@@ -1277,10 +1277,12 @@ static ssize_t mode_show(struct device *dev,

 	device_lock(dev);
 	claim = ndns->claim;
-	if (pmem_should_map_pages(dev) || (claim && is_nd_pfn(claim)))
-		mode = "memory";
-	else if (claim && is_nd_btt(claim))
+	if (claim && is_nd_btt(claim))
 		mode = "safe";
+	else if (claim && is_nd_pfn(claim))
+		mode = "memory";
+	else if (!claim && pmem_should_map_pages(dev))
+		mode = "memory";
 	else
 		mode = "raw";
 	rc = sprintf(buf, "%s\n", mode);

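The reordering makes an existing claim authoritative: a btt claim reports "safe", a pfn claim reports "memory", and only an unclaimed namespace falls back to pmem_should_map_pages(). A minimal sketch of the resulting precedence, with plain booleans standing in for the kernel predicates:

/* precedence sketch mirroring the fixed mode_show() ordering */
static const char *namespace_mode(int has_claim, int claim_is_btt,
		int claim_is_pfn, int maps_pages)
{
	if (has_claim && claim_is_btt)
		return "safe";
	if (has_claim && claim_is_pfn)
		return "memory";
	if (!has_claim && maps_pages)
		return "memory";
	return "raw";	/* including a claim of any other type */
}
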
+ 1 - 3
drivers/nvdimm/pfn_devs.c

@@ -301,10 +301,8 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)

 	switch (le32_to_cpu(pfn_sb->mode)) {
 	case PFN_MODE_RAM:
-		break;
 	case PFN_MODE_PMEM:
-		/* TODO: allocate from PMEM support */
-		return -ENOTTY;
+		break;
 	default:
 		return -ENXIO;
 	}

+ 0 - 28
fs/block_dev.c

@@ -1736,37 +1736,13 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
 }

-static void blkdev_vm_open(struct vm_area_struct *vma)
-{
-	struct inode *bd_inode = bdev_file_inode(vma->vm_file);
-	struct block_device *bdev = I_BDEV(bd_inode);
-
-	inode_lock(bd_inode);
-	bdev->bd_map_count++;
-	inode_unlock(bd_inode);
-}
-
-static void blkdev_vm_close(struct vm_area_struct *vma)
-{
-	struct inode *bd_inode = bdev_file_inode(vma->vm_file);
-	struct block_device *bdev = I_BDEV(bd_inode);
-
-	inode_lock(bd_inode);
-	bdev->bd_map_count--;
-	inode_unlock(bd_inode);
-}
-
 static const struct vm_operations_struct blkdev_dax_vm_ops = {
-	.open		= blkdev_vm_open,
-	.close		= blkdev_vm_close,
 	.fault		= blkdev_dax_fault,
 	.pmd_fault	= blkdev_dax_pmd_fault,
 	.pfn_mkwrite	= blkdev_dax_fault,
 };

 static const struct vm_operations_struct blkdev_default_vm_ops = {
-	.open		= blkdev_vm_open,
-	.close		= blkdev_vm_close,
 	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
 };
@@ -1774,18 +1750,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = {
 static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *bd_inode = bdev_file_inode(file);
-	struct block_device *bdev = I_BDEV(bd_inode);

 	file_accessed(file);
-	inode_lock(bd_inode);
-	bdev->bd_map_count++;
 	if (IS_DAX(bd_inode)) {
 		vma->vm_ops = &blkdev_dax_vm_ops;
 		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
 	} else {
 		vma->vm_ops = &blkdev_default_vm_ops;
 	}
-	inode_unlock(bd_inode);

 	return 0;
 }

+ 20 - 0
fs/dax.c

@@ -58,6 +58,26 @@ static void dax_unmap_atomic(struct block_device *bdev,
 	blk_queue_exit(bdev->bd_queue);
 }

+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+{
+	struct page *page = alloc_pages(GFP_KERNEL, 0);
+	struct blk_dax_ctl dax = {
+		.size = PAGE_SIZE,
+		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
+	};
+	long rc;
+
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	rc = dax_map_atomic(bdev, &dax);
+	if (rc < 0)
+		return ERR_PTR(rc);
+	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
+	dax_unmap_atomic(bdev, &dax);
+	return page;
+}
+
 /*
  * dax_clear_blocks() is called from within transaction context from XFS,
  * and hence this means the stack from this point must follow GFP_NOFS

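read_dax_sector() maps a whole page, so it first rounds the sector down to a page boundary: PAGE_SIZE / 512 is 8 with 4 KiB pages, and the mask clears the low three bits of n. A worked sketch of that masking, assuming PAGE_SIZE is 4096:

#include <stdio.h>

#define PAGE_SIZE 4096	/* assumed for illustration */

int main(void)
{
	unsigned long long n = 13;
	/* same expression as the .sector initializer above */
	unsigned long long aligned = n & ~((((int) PAGE_SIZE) / 512) - 1);

	printf("%llu\n", aligned);	/* 8: sectors 8..15 share one page */
	return 0;
}
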
+ 11 - 0
include/linux/dax.h

@@ -14,6 +14,17 @@ int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
 		dax_iodone_t);
 int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
 		dax_iodone_t);
+
+#ifdef CONFIG_FS_DAX
+struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+#else
+static inline struct page *read_dax_sector(struct block_device *bdev,
+		sector_t n)
+{
+	return ERR_PTR(-ENXIO);
+}
+#endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
 				unsigned int flags, get_block_t, dax_iodone_t);

+ 1 - 4
include/linux/fs.h

@@ -484,9 +484,6 @@ struct block_device {
 	int			bd_fsfreeze_count;
 	/* Mutex for freeze */
 	struct mutex		bd_fsfreeze_mutex;
-#ifdef CONFIG_FS_DAX
-	int			bd_map_count;
-#endif
 };

 /*
@@ -2907,7 +2904,7 @@ extern void replace_mount_options(struct super_block *sb, char *options);

 static inline bool io_is_direct(struct file *filp)
 {
-	return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp));
+	return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
 }

 static inline int iocb_flags(struct file *file)

+ 2 - 2
include/linux/pfn_t.h

@@ -29,7 +29,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn)
 	return __pfn_to_pfn_t(pfn, 0);
 }

-extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags);
+extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags);

 static inline bool pfn_t_has_page(pfn_t pfn)
 {
@@ -48,7 +48,7 @@ static inline struct page *pfn_t_to_page(pfn_t pfn)
 	return NULL;
 }

-static inline dma_addr_t pfn_t_to_phys(pfn_t pfn)
+static inline phys_addr_t pfn_t_to_phys(pfn_t pfn)
 {
 	return PFN_PHYS(pfn_t_to_pfn(pfn));
 }

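The switch from dma_addr_t to phys_addr_t matters on configurations where the two types differ in width, e.g. 32-bit PAE, where physical addresses are 64-bit but dma_addr_t may be only 32-bit; the narrower type would silently truncate the address before the PAGE_SHIFT conversion. A sketch of the hazard, with illustrative typedef widths (not the kernel's definitions):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t dma_addr_t;	/* assumed 32-bit for illustration */
typedef uint64_t phys_addr_t;	/* 64-bit physical address */

#define PAGE_SHIFT 12

int main(void)
{
	phys_addr_t addr = 0x100000000ULL;	/* just past 4 GiB */

	/* kept wide: correct pfn */
	printf("%llx\n", (unsigned long long)(addr >> PAGE_SHIFT));
	/* squeezed through dma_addr_t: truncates to 0 */
	printf("%llx\n", (unsigned long long)((dma_addr_t)addr >> PAGE_SHIFT));
	return 0;
}
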
+ 0 - 1
include/uapi/linux/fs.h

@@ -222,7 +222,6 @@ struct fsxattr {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
-#define BLKDAXSET _IO(0x12,128)
 #define BLKDAXGET _IO(0x12,129)

 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */

+ 12 - 8
kernel/memremap.c

@@ -150,7 +150,7 @@ void devm_memunmap(struct device *dev, void *addr)
 }
 EXPORT_SYMBOL(devm_memunmap);

-pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
+pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags)
 {
 	return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
 }
@@ -183,7 +183,11 @@ EXPORT_SYMBOL(put_zone_device_page);

 static void pgmap_radix_release(struct resource *res)
 {
-	resource_size_t key;
+	resource_size_t key, align_start, align_size, align_end;
+
+	align_start = res->start & ~(SECTION_SIZE - 1);
+	align_size = ALIGN(resource_size(res), SECTION_SIZE);
+	align_end = align_start + align_size - 1;

 	mutex_lock(&pgmap_lock);
 	for (key = res->start; key <= res->end; key += SECTION_SIZE)
@@ -226,12 +230,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
 		percpu_ref_put(pgmap->ref);
 	}

-	pgmap_radix_release(res);
-
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(resource_size(res), SECTION_SIZE);
 	arch_remove_memory(align_start, align_size);
+	pgmap_radix_release(res);
 	dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
 			"%s: failed to free all reserved pages\n", __func__);
 }
@@ -267,7 +270,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 {
 	int is_ram = region_intersects(res->start, resource_size(res),
 			"System RAM");
-	resource_size_t key, align_start, align_size;
+	resource_size_t key, align_start, align_size, align_end;
 	struct dev_pagemap *pgmap;
 	struct page_map *page_map;
 	unsigned long pfn;
@@ -309,7 +312,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,

 	mutex_lock(&pgmap_lock);
 	error = 0;
-	for (key = res->start; key <= res->end; key += SECTION_SIZE) {
+	align_start = res->start & ~(SECTION_SIZE - 1);
+	align_size = ALIGN(resource_size(res), SECTION_SIZE);
+	align_end = align_start + align_size - 1;
+	for (key = align_start; key <= align_end; key += SECTION_SIZE) {
 		struct dev_pagemap *dup;

 		rcu_read_lock();
@@ -336,8 +342,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	if (nid < 0)
 		nid = numa_mem_id();

-	align_start = res->start & ~(SECTION_SIZE - 1);
-	align_size = ALIGN(resource_size(res), SECTION_SIZE);
 	error = arch_add_memory(nid, align_start, align_size, true);
 	if (error)
 		goto err_add_memory;

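With this change the radix-tree keys inserted in devm_memremap_pages() and the range torn down at release are both derived from the section-aligned span rather than the raw resource, so a resource that does not start or end on a section boundary is covered consistently. A worked sketch of the alignment arithmetic, assuming SECTION_SIZE is 1 << 27 (128 MiB, the x86_64 value):

#include <stdio.h>

#define SECTION_SIZE (1ULL << 27)	/* assumed: 128 MiB sections */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long long start = 0x248400000ULL;	/* not section aligned */
	unsigned long long size  = 0x10400000ULL;	/* ~260 MiB */

	unsigned long long align_start = start & ~(SECTION_SIZE - 1);
	unsigned long long align_size  = ALIGN(size, SECTION_SIZE);
	unsigned long long align_end   = align_start + align_size - 1;

	/* prints 248000000 18000000 25fffffff: three full sections */
	printf("%llx %llx %llx\n", align_start, align_size, align_end);
	return 0;
}
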
+ 7 - 2
mm/memory.c

@@ -1591,10 +1591,15 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
 	 * without pte special, it would there be refcounted as a normal page.
 	 */
-	if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
+	if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
 		struct page *page;

-		page = pfn_t_to_page(pfn);
+		/*
+		 * At this point we are committed to insert_page()
+		 * regardless of whether the caller specified flags that
+		 * result in pfn_t_has_page() == false.
+		 */
+		page = pfn_to_page(pfn_t_to_pfn(pfn));
 		return insert_page(vma, addr, page, vma->vm_page_prot);
 	}
 	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);

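Background for the fix: pfn_t_to_page() returns NULL when a pfn is flagged PFN_DEV without PFN_MAP, so the old call could pass NULL straight into insert_page(). The new code excludes devmap pfns from this branch and converts the remainder with pfn_to_page(pfn_t_to_pfn(pfn)), which is safe because pfn_t_valid() already held. An approximate sketch of the guard in include/linux/pfn_t.h, with illustrative bit positions:

#include <stdbool.h>

typedef struct { unsigned long val; } pfn_t;

#define PFN_DEV (1UL << 62)	/* illustrative, not the kernel layout */
#define PFN_MAP (1UL << 61)

/* a struct page is only guaranteed when PFN_DEV is clear
 * or PFN_MAP is also set */
static bool pfn_t_has_page(pfn_t pfn)
{
	return (pfn.val & PFN_MAP) == PFN_MAP || (pfn.val & PFN_DEV) == 0;
}
/* pfn_t_to_page() returns NULL when pfn_t_has_page() is false, which
 * is exactly the NULL the old vm_insert_mixed() path dereferenced. */
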
+ 1 - 1
tools/testing/nvdimm/test/iomap.c

@@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res,
 }
 EXPORT_SYMBOL(__wrap_devm_memremap_pages);

-pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
+pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags)
 {
 	struct nfit_test_resource *nfit_res = get_nfit_res(addr);