Merge tag 'libnvdimm-for-4.13' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm updates from Dan Williams:
 "libnvdimm updates for the latest ACPI and UEFI specifications. This
  pull request also includes new 'struct dax_operations' enabling to
  undo the abuse of copy_user_nocache() for copy operations to pmem.

  The dax work originally missed the 4.12 merge window so that concerns
  raised by Al could be addressed.

  Summary:

   - Introduce the _flushcache() family of memory copy helpers and use
     them for persistent memory write operations on x86. The
     _flushcache() semantic indicates that the cache is either bypassed
     for the copy operation (movnt) or any lines dirtied by the copy
     operation are written back (clwb, clflushopt, or clflush).

   - Extend dax_operations with ->copy_from_iter() and ->flush()
     operations. These operations and other infrastructure updates allow
     all persistent-memory-specific dax functionality to be pushed into
     libnvdimm and the pmem driver directly (a sketch of the resulting
     driver-side wiring follows this message). They also allow
     dax-specific sysfs attributes to be linked to a host device, for
     example: /sys/block/pmem0/dax/write_cache

   - Add support for the new NVDIMM platform/firmware mechanisms
     introduced in ACPI 6.2 and UEFI 2.7. This support includes the v1.2
     namespace label format, extensions to the address-range-scrub
     command set, new error injection commands, and a new BTT
     (block-translation-table) layout. These updates support inter-OS
     and pre-OS compatibility.

   - Fix a longstanding memory corruption bug in nfit_test.

   - Make the pmem and nvdimm-region 'badblocks' sysfs files poll(2)
     capable.

   - Miscellaneous fixes and small updates across libnvdimm and the nfit
     driver.

  Acknowledgements that came after the branch was pushed: commit
  6aa734a2f38e ("libnvdimm, region, pmem: fix 'badblocks'
  sysfs_get_dirent() reference lifetime") was reviewed by Toshi Kani
  <toshi.kani@hpe.com>"
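
To make the dax_operations extension concrete, here is a minimal
driver-side sketch modeled on the axonram/brd/pmem changes in the
diffs below. The 'foo' names are hypothetical; copy_from_iter_flushcache()
and arch_wb_cache_pmem() are helpers this series introduces, and
.direct_access is whatever translation the driver already provides.

/* sketch: a hypothetical 'foo' driver publishing the new operations */
static size_t foo_dax_copy_from_iter(struct dax_device *dax_dev,
		pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
{
	/* flushcache copy instead of the copy_user_nocache() abuse */
	return copy_from_iter_flushcache(addr, bytes, i);
}

static void foo_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t size)
{
	/* write back any cache lines dirtied against this range */
	arch_wb_cache_pmem(addr, size);
}

static const struct dax_operations foo_dax_ops = {
	.direct_access	= foo_dax_direct_access,	/* driver specific */
	.copy_from_iter	= foo_dax_copy_from_iter,
	.flush		= foo_dax_flush,
};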

* tag 'libnvdimm-for-4.13' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (42 commits)
  libnvdimm, namespace: record 'lbasize' for pmem namespaces
  acpi/nfit: Issue Start ARS to retrieve existing records
  libnvdimm: New ACPI 6.2 DSM functions
  acpi, nfit: Show bus_dsm_mask in sysfs
  libnvdimm, acpi, nfit: Add bus level dsm mask for pass thru.
  acpi, nfit: Enable DSM pass thru for root functions.
  libnvdimm: passthru functions clear to send
  libnvdimm, btt: convert some info messages to warn/err
  libnvdimm, region, pmem: fix 'badblocks' sysfs_get_dirent() reference lifetime
  libnvdimm: fix the clear-error check in nsio_rw_bytes
  libnvdimm, btt: fix btt_rw_page not returning errors
  acpi, nfit: quiet invalid block-aperture-region warnings
  libnvdimm, btt: BTT updates for UEFI 2.7 format
  acpi, nfit: constify *_attribute_group
  libnvdimm, pmem: disable dax flushing when pmem is fronting a volatile region
  libnvdimm, pmem, dax: export a cache control attribute
  dax: convert to bitmask for flags
  dax: remove default copy_from_iter fallback
  libnvdimm, nfit: enable support for volatile ranges
  libnvdimm, pmem: fix persistence warning
  ...
Linus Torvalds, 8 years ago
Commit b6ffe9ba46
47 changed files with 1504 additions and 460 deletions
   1. MAINTAINERS (+1, -3)
   2. arch/powerpc/sysdev/axonram.c (+8, -0)
   3. arch/x86/Kconfig (+1, -0)
   4. arch/x86/include/asm/pmem.h (+0, -136)
   5. arch/x86/include/asm/string_64.h (+5, -0)
   6. arch/x86/include/asm/uaccess_64.h (+11, -0)
   7. arch/x86/lib/usercopy_64.c (+134, -0)
   8. arch/x86/mm/pageattr.c (+6, -0)
   9. drivers/acpi/nfit/core.c (+142, -25)
  10. drivers/acpi/nfit/mce.c (+1, -1)
  11. drivers/acpi/nfit/nfit.h (+3, -1)
  12. drivers/block/brd.c (+8, -0)
  13. drivers/dax/super.c (+112, -6)
  14. drivers/md/dm-linear.c (+30, -0)
  15. drivers/md/dm-stripe.c (+40, -0)
  16. drivers/md/dm.c (+45, -0)
  17. drivers/nvdimm/btt.c (+30, -15)
  18. drivers/nvdimm/btt.h (+2, -0)
  19. drivers/nvdimm/btt_devs.c (+49, -5)
  20. drivers/nvdimm/bus.c (+10, -5)
  21. drivers/nvdimm/claim.c (+33, -5)
  22. drivers/nvdimm/core.c (+4, -1)
  23. drivers/nvdimm/dax_devs.c (+9, -1)
  24. drivers/nvdimm/dimm_devs.c (+9, -1)
  25. drivers/nvdimm/label.c (+224, -27)
  26. drivers/nvdimm/label.h (+19, -2)
  27. drivers/nvdimm/namespace_devs.c (+241, -41)
  28. drivers/nvdimm/nd-core.h (+9, -0)
  29. drivers/nvdimm/nd.h (+15, -2)
  30. drivers/nvdimm/pfn_devs.c (+10, -2)
  31. drivers/nvdimm/pmem.c (+57, -6)
  32. drivers/nvdimm/pmem.h (+15, -0)
  33. drivers/nvdimm/region.c (+15, -2)
  34. drivers/nvdimm/region_devs.c (+64, -24)
  35. drivers/s390/block/dcssblk.c (+8, -0)
  36. fs/dax.c (+5, -4)
  37. include/linux/dax.h (+12, -0)
  38. include/linux/device-mapper.h (+6, -0)
  39. include/linux/libnvdimm.h (+10, -1)
  40. include/linux/nd.h (+13, -0)
  41. include/linux/pmem.h (+0, -142)
  42. include/linux/string.h (+6, -0)
  43. include/linux/uio.h (+15, -0)
  44. include/uapi/linux/ndctl.h (+41, -1)
  45. lib/Kconfig (+3, -0)
  46. lib/iov_iter.c (+22, -0)
  47. tools/testing/nvdimm/test/nfit.c (+1, -1)

+ 1 - 3
MAINTAINERS

@@ -7680,9 +7680,7 @@ M:	Ross Zwisler <ross.zwisler@linux.intel.com>
 L:	linux-nvdimm@lists.01.org
 Q:	https://patchwork.kernel.org/project/linux-nvdimm/list/
 S:	Supported
-F:	drivers/nvdimm/pmem.c
-F:	include/linux/pmem.h
-F:	arch/*/include/asm/pmem.h
+F:	drivers/nvdimm/pmem*
 
 LIGHTNVM PLATFORM SUPPORT
 M:	Matias Bjorling <mb@lightnvm.io>

+ 8 - 0
arch/powerpc/sysdev/axonram.c

@@ -45,6 +45,7 @@
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 #include <linux/pfn_t.h>
+#include <linux/uio.h>
 
 #include <asm/page.h>
 #include <asm/prom.h>
@@ -163,8 +164,15 @@ axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pa
 	return __axon_ram_direct_access(bank, pgoff, nr_pages, kaddr, pfn);
 }
 
+static size_t axon_ram_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	return copy_from_iter(addr, bytes, i);
+}
+
 static const struct dax_operations axon_ram_dax_ops = {
 	.direct_access = axon_ram_dax_direct_access,
+	.copy_from_iter = axon_ram_copy_from_iter,
 };
 
 /**

+ 1 - 0
arch/x86/Kconfig

@@ -54,6 +54,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MMIO_FLUSH
 	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX

+ 0 - 136
arch/x86/include/asm/pmem.h

@@ -1,136 +0,0 @@
-/*
- * Copyright(c) 2015 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-#ifndef __ASM_X86_PMEM_H__
-#define __ASM_X86_PMEM_H__
-
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <asm/cpufeature.h>
-#include <asm/special_insns.h>
-
-#ifdef CONFIG_ARCH_HAS_PMEM_API
-/**
- * arch_memcpy_to_pmem - copy data to persistent memory
- * @dst: destination buffer for the copy
- * @src: source buffer for the copy
- * @n: length of the copy in bytes
- *
- * Copy data to persistent memory media via non-temporal stores so that
- * a subsequent pmem driver flush operation will drain posted write queues.
- */
-static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
-{
-	int rem;
-
-	/*
-	 * We are copying between two kernel buffers, if
-	 * __copy_from_user_inatomic_nocache() returns an error (page
-	 * fault) we would have already reported a general protection fault
-	 * before the WARN+BUG.
-	 */
-	rem = __copy_from_user_inatomic_nocache(dst, (void __user *) src, n);
-	if (WARN(rem, "%s: fault copying %p <- %p unwritten: %d\n",
-				__func__, dst, src, rem))
-		BUG();
-}
-
-/**
- * arch_wb_cache_pmem - write back a cache range with CLWB
- * @vaddr:	virtual start address
- * @size:	number of bytes to write back
- *
- * Write back a cache range using the CLWB (cache line write back)
- * instruction. Note that @size is internally rounded up to be cache
- * line size aligned.
- */
-static inline void arch_wb_cache_pmem(void *addr, size_t size)
-{
-	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
-	unsigned long clflush_mask = x86_clflush_size - 1;
-	void *vend = addr + size;
-	void *p;
-
-	for (p = (void *)((unsigned long)addr & ~clflush_mask);
-	     p < vend; p += x86_clflush_size)
-		clwb(p);
-}
-
-/**
- * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
- * @addr:	PMEM destination address
- * @bytes:	number of bytes to copy
- * @i:		iterator with source data
- *
- * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
- */
-static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
-		struct iov_iter *i)
-{
-	size_t len;
-
-	/* TODO: skip the write-back by always using non-temporal stores */
-	len = copy_from_iter_nocache(addr, bytes, i);
-
-	/*
-	 * In the iovec case on x86_64 copy_from_iter_nocache() uses
-	 * non-temporal stores for the bulk of the transfer, but we need
-	 * to manually flush if the transfer is unaligned. A cached
-	 * memory copy is used when destination or size is not naturally
-	 * aligned. That is:
-	 *   - Require 8-byte alignment when size is 8 bytes or larger.
-	 *   - Require 4-byte alignment when size is 4 bytes.
-	 *
-	 * In the non-iovec case the entire destination needs to be
-	 * flushed.
-	 */
-	if (iter_is_iovec(i)) {
-		unsigned long flushed, dest = (unsigned long) addr;
-
-		if (bytes < 8) {
-			if (!IS_ALIGNED(dest, 4) || (bytes != 4))
-				arch_wb_cache_pmem(addr, bytes);
-		} else {
-			if (!IS_ALIGNED(dest, 8)) {
-				dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
-				arch_wb_cache_pmem(addr, 1);
-			}
-
-			flushed = dest - (unsigned long) addr;
-			if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8))
-				arch_wb_cache_pmem(addr + bytes - 1, 1);
-		}
-	} else
-		arch_wb_cache_pmem(addr, bytes);
-
-	return len;
-}
-
-/**
- * arch_clear_pmem - zero a PMEM memory range
- * @addr:	virtual start address
- * @size:	number of bytes to zero
- *
- * Write zeros into the memory range starting at 'addr' for 'size' bytes.
- */
-static inline void arch_clear_pmem(void *addr, size_t size)
-{
-	memset(addr, 0, size);
-	arch_wb_cache_pmem(addr, size);
-}
-
-static inline void arch_invalidate_pmem(void *addr, size_t size)
-{
-	clflush_cache_range(addr, size);
-}
-#endif /* CONFIG_ARCH_HAS_PMEM_API */
-#endif /* __ASM_X86_PMEM_H__ */

+ 5 - 0
arch/x86/include/asm/string_64.h

@@ -109,6 +109,11 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 	return 0;
 }
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
+void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */

+ 11 - 0
arch/x86/include/asm/uaccess_64.h

@@ -171,6 +171,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne
 extern long __copy_user_nocache(void *dst, const void __user *src,
 				unsigned size, int zerorest);
 
+extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
+extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+			   size_t len);
+
 static inline int
 __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 				  unsigned size)
@@ -179,6 +183,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
+static inline int
+__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
+{
+	kasan_check_write(dst, size);
+	return __copy_user_flushcache(dst, src, size);
+}
+
 unsigned long
 copy_user_handle_tail(char *to, char *from, unsigned len);
 

+ 134 - 0
arch/x86/lib/usercopy_64.c

@@ -7,6 +7,7 @@
  */
 #include <linux/export.h>
 #include <linux/uaccess.h>
+#include <linux/highmem.h>
 
 /*
  * Zero Userspace
@@ -73,3 +74,136 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
 	clac();
 	return len;
 }
+
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+/**
+ * clean_cache_range - write back a cache range with CLWB
+ * @vaddr:	virtual start address
+ * @size:	number of bytes to write back
+ *
+ * Write back a cache range using the CLWB (cache line write back)
+ * instruction. Note that @size is internally rounded up to be cache
+ * line size aligned.
+ */
+static void clean_cache_range(void *addr, size_t size)
+{
+	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+	unsigned long clflush_mask = x86_clflush_size - 1;
+	void *vend = addr + size;
+	void *p;
+
+	for (p = (void *)((unsigned long)addr & ~clflush_mask);
+	     p < vend; p += x86_clflush_size)
+		clwb(p);
+}
+
+void arch_wb_cache_pmem(void *addr, size_t size)
+{
+	clean_cache_range(addr, size);
+}
+EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
+
+long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
+{
+	unsigned long flushed, dest = (unsigned long) dst;
+	long rc = __copy_user_nocache(dst, src, size, 0);
+
+	/*
+	 * __copy_user_nocache() uses non-temporal stores for the bulk
+	 * of the transfer, but we need to manually flush if the
+	 * transfer is unaligned. A cached memory copy is used when
+	 * destination or size is not naturally aligned. That is:
+	 *   - Require 8-byte alignment when size is 8 bytes or larger.
+	 *   - Require 4-byte alignment when size is 4 bytes.
+	 */
+	if (size < 8) {
+		if (!IS_ALIGNED(dest, 4) || size != 4)
+			clean_cache_range(dst, 1);
+	} else {
+		if (!IS_ALIGNED(dest, 8)) {
+			dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
+			clean_cache_range(dst, 1);
+		}
+
+		flushed = dest - (unsigned long) dst;
+		if (size > flushed && !IS_ALIGNED(size - flushed, 8))
+			clean_cache_range(dst + size - 1, 1);
+	}
+
+	return rc;
+}
+
+void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+{
+	unsigned long dest = (unsigned long) _dst;
+	unsigned long source = (unsigned long) _src;
+
+	/* cache copy and flush to align dest */
+	if (!IS_ALIGNED(dest, 8)) {
+		unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
+
+		memcpy((void *) dest, (void *) source, len);
+		clean_cache_range((void *) dest, len);
+		dest += len;
+		source += len;
+		size -= len;
+		if (!size)
+			return;
+	}
+
+	/* 4x8 movnti loop */
+	while (size >= 32) {
+		asm("movq    (%0), %%r8\n"
+		    "movq   8(%0), %%r9\n"
+		    "movq  16(%0), %%r10\n"
+		    "movq  24(%0), %%r11\n"
+		    "movnti  %%r8,   (%1)\n"
+		    "movnti  %%r9,  8(%1)\n"
+		    "movnti %%r10, 16(%1)\n"
+		    "movnti %%r11, 24(%1)\n"
+		    :: "r" (source), "r" (dest)
+		    : "memory", "r8", "r9", "r10", "r11");
+		dest += 32;
+		source += 32;
+		size -= 32;
+	}
+
+	/* 1x8 movnti loop */
+	while (size >= 8) {
+		asm("movq    (%0), %%r8\n"
+		    "movnti  %%r8,   (%1)\n"
+		    :: "r" (source), "r" (dest)
+		    : "memory", "r8");
+		dest += 8;
+		source += 8;
+		size -= 8;
+	}
+
+	/* 1x4 movnti loop */
+	while (size >= 4) {
+		asm("movl    (%0), %%r8d\n"
+		    "movnti  %%r8d,   (%1)\n"
+		    :: "r" (source), "r" (dest)
+		    : "memory", "r8");
+		dest += 4;
+		source += 4;
+		size -= 4;
+	}
+
+	/* cache copy for remaining bytes */
+	if (size) {
+		memcpy((void *) dest, (void *) source, size);
+		clean_cache_range((void *) dest, size);
+	}
+}
+EXPORT_SYMBOL_GPL(memcpy_flushcache);
+
+void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+		size_t len)
+{
+	char *from = kmap_atomic(page);
+
+	memcpy_flushcache(to, from + offset, len);
+	kunmap_atomic(from);
+}
+#endif
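
For context, a condensed sketch of the intended write path built on
these exports, mirroring the drivers/nvdimm/claim.c change further
down; write_and_flush() is a hypothetical helper name:

/* sketch: make 'size' bytes at 'pmem_addr' durable */
static void write_and_flush(struct nd_region *nd_region, void *pmem_addr,
		const void *buf, size_t size)
{
	/* non-temporal copy, or cached copy plus clwb for odd tails */
	memcpy_flushcache(pmem_addr, buf, size);
	/* drain write-pending queues in the memory controller */
	nvdimm_flush(nd_region);
}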

+ 6 - 0
arch/x86/mm/pageattr.c

@@ -150,6 +150,12 @@ void clflush_cache_range(void *vaddr, unsigned int size)
 }
 EXPORT_SYMBOL_GPL(clflush_cache_range);
 
+void arch_invalidate_pmem(void *addr, size_t size)
+{
+	clflush_cache_range(addr, size);
+}
+EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
+
 static void __cpa_flush_all(void *arg)
 {
 	unsigned long cache = (unsigned long)arg;

+ 142 - 25
drivers/acpi/nfit/core.c

@@ -20,7 +20,6 @@
 #include <linux/list.h>
 #include <linux/acpi.h>
 #include <linux/sort.h>
-#include <linux/pmem.h>
 #include <linux/io.h>
 #include <linux/nd.h>
 #include <asm/cacheflush.h>
@@ -253,6 +252,8 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 		cmd_name = nvdimm_bus_cmd_name(cmd);
 		cmd_mask = nd_desc->cmd_mask;
 		dsm_mask = cmd_mask;
+		if (cmd == ND_CMD_CALL)
+			dsm_mask = nd_desc->bus_dsm_mask;
 		desc = nd_cmd_bus_desc(cmd);
 		guid = to_nfit_uuid(NFIT_DEV_BUS);
 		handle = adev->handle;
@@ -927,6 +928,17 @@ static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
 	return 0;
 }
 
+static ssize_t bus_dsm_mask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+
+	return sprintf(buf, "%#lx\n", nd_desc->bus_dsm_mask);
+}
+static struct device_attribute dev_attr_bus_dsm_mask =
+		__ATTR(dsm_mask, 0444, bus_dsm_mask_show, NULL);
+
 static ssize_t revision_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -1031,7 +1043,7 @@ static ssize_t scrub_store(struct device *dev,
 	if (nd_desc) {
 		struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
 
-		rc = acpi_nfit_ars_rescan(acpi_desc);
+		rc = acpi_nfit_ars_rescan(acpi_desc, 0);
 	}
 	device_unlock(dev);
 	if (rc)
@@ -1063,10 +1075,11 @@ static struct attribute *acpi_nfit_attributes[] = {
 	&dev_attr_revision.attr,
 	&dev_attr_scrub.attr,
 	&dev_attr_hw_error_scrub.attr,
+	&dev_attr_bus_dsm_mask.attr,
 	NULL,
 };
 
-static struct attribute_group acpi_nfit_attribute_group = {
+static const struct attribute_group acpi_nfit_attribute_group = {
 	.name = "nfit",
 	.attrs = acpi_nfit_attributes,
 	.is_visible = nfit_visible,
@@ -1346,7 +1359,7 @@ static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj,
 	return a->mode;
 }
 
-static struct attribute_group acpi_nfit_dimm_attribute_group = {
+static const struct attribute_group acpi_nfit_dimm_attribute_group = {
 	.name = "nfit",
 	.attrs = acpi_nfit_dimm_attributes,
 	.is_visible = acpi_nfit_dimm_attr_visible,
@@ -1608,11 +1621,23 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 			acpi_desc);
 }
 
+/*
+ * These constants are private because there are no kernel consumers of
+ * these commands.
+ */
+enum nfit_aux_cmds {
+        NFIT_CMD_TRANSLATE_SPA = 5,
+        NFIT_CMD_ARS_INJECT_SET = 7,
+        NFIT_CMD_ARS_INJECT_CLEAR = 8,
+        NFIT_CMD_ARS_INJECT_GET = 9,
+};
+
 static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
 {
 	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
 	const guid_t *guid = to_nfit_uuid(NFIT_DEV_BUS);
 	struct acpi_device *adev;
+	unsigned long dsm_mask;
 	int i;
 
 	nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
@@ -1623,6 +1648,20 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
 	for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
 		if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
 			set_bit(i, &nd_desc->cmd_mask);
+	set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
+
+	dsm_mask =
+		(1 << ND_CMD_ARS_CAP) |
+		(1 << ND_CMD_ARS_START) |
+		(1 << ND_CMD_ARS_STATUS) |
+		(1 << ND_CMD_CLEAR_ERROR) |
+		(1 << NFIT_CMD_TRANSLATE_SPA) |
+		(1 << NFIT_CMD_ARS_INJECT_SET) |
+		(1 << NFIT_CMD_ARS_INJECT_CLEAR) |
+		(1 << NFIT_CMD_ARS_INJECT_GET);
+	for_each_set_bit(i, &dsm_mask, BITS_PER_LONG)
+		if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
+			set_bit(i, &nd_desc->bus_dsm_mask);
 }
 
 static ssize_t range_index_show(struct device *dev,
@@ -1640,7 +1679,7 @@ static struct attribute *acpi_nfit_region_attributes[] = {
 	NULL,
 };
 
-static struct attribute_group acpi_nfit_region_attribute_group = {
+static const struct attribute_group acpi_nfit_region_attribute_group = {
 	.name = "nfit",
 	.attrs = acpi_nfit_region_attributes,
 };
@@ -1663,12 +1702,29 @@ struct nfit_set_info {
 	} mapping[0];
 };
 
+struct nfit_set_info2 {
+	struct nfit_set_info_map2 {
+		u64 region_offset;
+		u32 serial_number;
+		u16 vendor_id;
+		u16 manufacturing_date;
+		u8  manufacturing_location;
+		u8  reserved[31];
+	} mapping[0];
+};
+
 static size_t sizeof_nfit_set_info(int num_mappings)
 {
 	return sizeof(struct nfit_set_info)
 		+ num_mappings * sizeof(struct nfit_set_info_map);
 }
 
+static size_t sizeof_nfit_set_info2(int num_mappings)
+{
+	return sizeof(struct nfit_set_info2)
+		+ num_mappings * sizeof(struct nfit_set_info_map2);
+}
+
 static int cmp_map_compat(const void *m0, const void *m1)
 {
 	const struct nfit_set_info_map *map0 = m0;
@@ -1690,6 +1746,18 @@ static int cmp_map(const void *m0, const void *m1)
 	return 0;
 }
 
+static int cmp_map2(const void *m0, const void *m1)
+{
+	const struct nfit_set_info_map2 *map0 = m0;
+	const struct nfit_set_info_map2 *map1 = m1;
+
+	if (map0->region_offset < map1->region_offset)
+		return -1;
+	else if (map0->region_offset > map1->region_offset)
+		return 1;
+	return 0;
+}
+
 /* Retrieve the nth entry referencing this spa */
 static struct acpi_nfit_memory_map *memdev_from_spa(
 		struct acpi_nfit_desc *acpi_desc, u16 range_index, int n)
@@ -1707,27 +1775,31 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
 		struct nd_region_desc *ndr_desc,
 		struct acpi_nfit_system_address *spa)
 {
-	int i, spa_type = nfit_spa_type(spa);
 	struct device *dev = acpi_desc->dev;
 	struct nd_interleave_set *nd_set;
 	u16 nr = ndr_desc->num_mappings;
+	struct nfit_set_info2 *info2;
 	struct nfit_set_info *info;
-
-	if (spa_type == NFIT_SPA_PM || spa_type == NFIT_SPA_VOLATILE)
-		/* pass */;
-	else
-		return 0;
+	int i;
 
 	nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
 	if (!nd_set)
 		return -ENOMEM;
+	ndr_desc->nd_set = nd_set;
+	guid_copy(&nd_set->type_guid, (guid_t *) spa->range_guid);
 
 	info = devm_kzalloc(dev, sizeof_nfit_set_info(nr), GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
+
+	info2 = devm_kzalloc(dev, sizeof_nfit_set_info2(nr), GFP_KERNEL);
+	if (!info2)
+		return -ENOMEM;
+
 	for (i = 0; i < nr; i++) {
 		struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
 		struct nfit_set_info_map *map = &info->mapping[i];
+		struct nfit_set_info_map2 *map2 = &info2->mapping[i];
 		struct nvdimm *nvdimm = mapping->nvdimm;
 		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
 		struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc,
@@ -1740,19 +1812,32 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
 
 		map->region_offset = memdev->region_offset;
 		map->serial_number = nfit_mem->dcr->serial_number;
+
+		map2->region_offset = memdev->region_offset;
+		map2->serial_number = nfit_mem->dcr->serial_number;
+		map2->vendor_id = nfit_mem->dcr->vendor_id;
+		map2->manufacturing_date = nfit_mem->dcr->manufacturing_date;
+		map2->manufacturing_location = nfit_mem->dcr->manufacturing_location;
 	}
 
+	/* v1.1 namespaces */
 	sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map),
 			cmp_map, NULL);
-	nd_set->cookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
+	nd_set->cookie1 = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
+
+	/* v1.2 namespaces */
+	sort(&info2->mapping[0], nr, sizeof(struct nfit_set_info_map2),
+			cmp_map2, NULL);
+	nd_set->cookie2 = nd_fletcher64(info2, sizeof_nfit_set_info2(nr), 0);
 
-	/* support namespaces created with the wrong sort order */
+	/* support v1.1 namespaces created with the wrong sort order */
 	sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map),
 			cmp_map_compat, NULL);
 	nd_set->altcookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
 
 	ndr_desc->nd_set = nd_set;
 	devm_kfree(dev, info);
+	devm_kfree(dev, info2);
 
 	return 0;
 }
@@ -1842,8 +1927,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
 		}
 
 		if (rw)
-			memcpy_to_pmem(mmio->addr.aperture + offset,
-					iobuf + copied, c);
+			memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c);
 		else {
 			if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
 				mmio_flush_range((void __force *)
@@ -1957,7 +2041,7 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
 	nfit_blk->bdw_offset = nfit_mem->bdw->offset;
 	mmio = &nfit_blk->mmio[BDW];
 	mmio->addr.base = devm_nvdimm_memremap(dev, nfit_mem->spa_bdw->address,
-                        nfit_mem->spa_bdw->length, ARCH_MEMREMAP_PMEM);
+                        nfit_mem->spa_bdw->length, nd_blk_memremap_flags(ndbr));
 	if (!mmio->addr.base) {
 		dev_dbg(dev, "%s: %s failed to map bdw\n", __func__,
 				nvdimm_name(nvdimm));
@@ -2051,6 +2135,7 @@ static int ars_start(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa
 	memset(&ars_start, 0, sizeof(ars_start));
 	ars_start.address = spa->address;
 	ars_start.length = spa->length;
+	ars_start.flags = acpi_desc->ars_start_flags;
 	if (nfit_spa_type(spa) == NFIT_SPA_PM)
 		ars_start.type = ND_ARS_PERSISTENT;
 	else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE)
@@ -2077,6 +2162,7 @@ static int ars_continue(struct acpi_nfit_desc *acpi_desc)
 	ars_start.address = ars_status->restart_address;
 	ars_start.length = ars_status->restart_length;
 	ars_start.type = ars_status->type;
+	ars_start.flags = acpi_desc->ars_start_flags;
 	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, &ars_start,
 			sizeof(ars_start), &cmd_rc);
 	if (rc < 0)
@@ -2179,7 +2265,7 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 	struct acpi_nfit_system_address *spa = nfit_spa->spa;
 	struct nd_blk_region_desc *ndbr_desc;
 	struct nfit_mem *nfit_mem;
-	int blk_valid = 0;
+	int blk_valid = 0, rc;
 
 	if (!nvdimm) {
 		dev_err(acpi_desc->dev, "spa%d dimm: %#x not found\n",
@@ -2211,6 +2297,9 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 		ndbr_desc = to_blk_region_desc(ndr_desc);
 		ndbr_desc->enable = acpi_nfit_blk_region_enable;
 		ndbr_desc->do_io = acpi_desc->blk_do_io;
+		rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa);
+		if (rc)
+			return rc;
 		nfit_spa->nd_region = nvdimm_blk_region_create(acpi_desc->nvdimm_bus,
 				ndr_desc);
 		if (!nfit_spa->nd_region)
@@ -2229,6 +2318,13 @@ static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa)
 		nfit_spa_type(spa) == NFIT_SPA_PCD);
 }
 
+static bool nfit_spa_is_volatile(struct acpi_nfit_system_address *spa)
+{
+	return (nfit_spa_type(spa) == NFIT_SPA_VDISK ||
+		nfit_spa_type(spa) == NFIT_SPA_VCD   ||
+		nfit_spa_type(spa) == NFIT_SPA_VOLATILE);
+}
+
 static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 		struct nfit_spa *nfit_spa)
 {
@@ -2303,7 +2399,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 				ndr_desc);
 		if (!nfit_spa->nd_region)
 			rc = -ENOMEM;
-	} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
+	} else if (nfit_spa_is_volatile(spa)) {
 		nfit_spa->nd_region = nvdimm_volatile_region_create(nvdimm_bus,
 				ndr_desc);
 		if (!nfit_spa->nd_region)
@@ -2595,6 +2691,7 @@ static void acpi_nfit_scrub(struct work_struct *work)
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
 		acpi_nfit_async_scrub(acpi_desc, nfit_spa);
 	acpi_desc->scrub_count++;
+	acpi_desc->ars_start_flags = 0;
 	if (acpi_desc->scrub_count_state)
 		sysfs_notify_dirent(acpi_desc->scrub_count_state);
 	mutex_unlock(&acpi_desc->init_mutex);
@@ -2613,6 +2710,7 @@ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
 				return rc;
 		}
 
+	acpi_desc->ars_start_flags = 0;
 	if (!acpi_desc->cancel)
 		queue_work(nfit_wq, &acpi_desc->work);
 	return 0;
@@ -2817,7 +2915,7 @@ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc,
 	return 0;
 }
 
-int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
+int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc, u8 flags)
 {
 	struct device *dev = acpi_desc->dev;
 	struct nfit_spa *nfit_spa;
@@ -2839,6 +2937,7 @@ int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
 
 		nfit_spa->ars_required = 1;
 	}
+	acpi_desc->ars_start_flags = flags;
 	queue_work(nfit_wq, &acpi_desc->work);
 	dev_dbg(dev, "%s: ars_scan triggered\n", __func__);
 	mutex_unlock(&acpi_desc->init_mutex);
@@ -2967,7 +3066,7 @@ static int acpi_nfit_remove(struct acpi_device *adev)
 	return 0;
 }
 
-void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event)
+static void acpi_nfit_update_notify(struct device *dev, acpi_handle handle)
 {
 	struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev);
 	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -2975,11 +3074,6 @@ void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event)
 	acpi_status status;
 	int ret;
 
-	dev_dbg(dev, "%s: event: %d\n", __func__, event);
-
-	if (event != NFIT_NOTIFY_UPDATE)
-		return;
-
 	if (!dev->driver) {
 		/* dev->driver may be null if we're being removed */
 		dev_dbg(dev, "%s: no driver found for dev\n", __func__);
@@ -3016,6 +3110,29 @@ void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event)
 		dev_err(dev, "Invalid _FIT\n");
 	kfree(buf.pointer);
 }
+
+static void acpi_nfit_uc_error_notify(struct device *dev, acpi_handle handle)
+{
+	struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev);
+	u8 flags = (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) ?
+			0 : ND_ARS_RETURN_PREV_DATA;
+
+	acpi_nfit_ars_rescan(acpi_desc, flags);
+}
+
+void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event)
+{
+	dev_dbg(dev, "%s: event: 0x%x\n", __func__, event);
+
+	switch (event) {
+	case NFIT_NOTIFY_UPDATE:
+		return acpi_nfit_update_notify(dev, handle);
+	case NFIT_NOTIFY_UC_MEMORY_ERROR:
+		return acpi_nfit_uc_error_notify(dev, handle);
+	default:
+		return;
+	}
+}
 EXPORT_SYMBOL_GPL(__acpi_nfit_notify);
 
 static void acpi_nfit_notify(struct acpi_device *adev, u32 event)
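
Condensing the interleave-set change above: the v1.2 cookie is a
fletcher64 checksum over the per-DIMM identity tuples sorted by region
offset, so it is stable regardless of memdev enumeration order. A
sketch using the structures from this diff (nfit_set_cookie2() is a
hypothetical name):

static u64 nfit_set_cookie2(struct nfit_set_info2 *info2, int num_mappings)
{
	/* deterministic order first, then checksum the identity tuples */
	sort(&info2->mapping[0], num_mappings,
			sizeof(struct nfit_set_info_map2), cmp_map2, NULL);
	return nd_fletcher64(info2, sizeof_nfit_set_info2(num_mappings), 0);
}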

+ 1 - 1
drivers/acpi/nfit/mce.c

@@ -79,7 +79,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
 			 * already in progress, just let that be the last
 			 * authoritative one
 			 */
-			acpi_nfit_ars_rescan(acpi_desc);
+			acpi_nfit_ars_rescan(acpi_desc, 0);
 		}
 		break;
 	}

+ 3 - 1
drivers/acpi/nfit/nfit.h

@@ -79,6 +79,7 @@ enum {
 
 enum nfit_root_notifiers {
 	NFIT_NOTIFY_UPDATE = 0x80,
+	NFIT_NOTIFY_UC_MEMORY_ERROR = 0x81,
 };
 
 enum nfit_dimm_notifiers {
@@ -154,6 +155,7 @@ struct acpi_nfit_desc {
 	struct list_head idts;
 	struct nvdimm_bus *nvdimm_bus;
 	struct device *dev;
+	u8 ars_start_flags;
 	struct nd_cmd_ars_status *ars_status;
 	size_t ars_status_size;
 	struct work_struct work;
@@ -206,7 +208,7 @@ struct nfit_blk {
 
 extern struct list_head acpi_descs;
 extern struct mutex acpi_desc_lock;
-int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc);
+int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc, u8 flags);
 
 #ifdef CONFIG_X86_MCE
 void nfit_mce_register(void);

+ 8 - 0
drivers/block/brd.c

@@ -22,6 +22,7 @@
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 #include <linux/pfn_t.h>
 #include <linux/dax.h>
+#include <linux/uio.h>
 #endif
 
 #include <linux/uaccess.h>
@@ -354,8 +355,15 @@ static long brd_dax_direct_access(struct dax_device *dax_dev,
 	return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
 }
 
+static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	return copy_from_iter(addr, bytes, i);
+}
+
 static const struct dax_operations brd_dax_ops = {
 	.direct_access = brd_dax_direct_access,
+	.copy_from_iter = brd_dax_copy_from_iter,
 };
 #endif
 

+ 112 - 6
drivers/dax/super.c

@@ -18,6 +18,7 @@
 #include <linux/cdev.h>
 #include <linux/hash.h>
 #include <linux/slab.h>
+#include <linux/uio.h>
 #include <linux/dax.h>
 #include <linux/fs.h>
 
@@ -115,13 +116,20 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 EXPORT_SYMBOL_GPL(__bdev_dax_supported);
 #endif
 
+enum dax_device_flags {
+	/* !alive + rcu grace period == no new operations / mappings */
+	DAXDEV_ALIVE,
+	/* gate whether dax_flush() calls the low level flush routine */
+	DAXDEV_WRITE_CACHE,
+};
+
 /**
  * struct dax_device - anchor object for dax services
  * @inode: core vfs
  * @cdev: optional character interface for "device dax"
  * @host: optional name for lookups where the device path is not available
  * @private: dax driver private data
- * @alive: !alive + rcu grace period == no new operations / mappings
+ * @flags: state and boolean properties
  */
 struct dax_device {
 	struct hlist_node list;
@@ -129,10 +137,75 @@ struct dax_device {
 	struct cdev cdev;
 	const char *host;
 	void *private;
-	bool alive;
+	unsigned long flags;
 	const struct dax_operations *ops;
 };
 
+static ssize_t write_cache_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
+	ssize_t rc;
+
+	WARN_ON_ONCE(!dax_dev);
+	if (!dax_dev)
+		return -ENXIO;
+
+	rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE,
+				&dax_dev->flags));
+	put_dax(dax_dev);
+	return rc;
+}
+
+static ssize_t write_cache_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	bool write_cache;
+	int rc = strtobool(buf, &write_cache);
+	struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
+
+	WARN_ON_ONCE(!dax_dev);
+	if (!dax_dev)
+		return -ENXIO;
+
+	if (rc)
+		len = rc;
+	else if (write_cache)
+		set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
+	else
+		clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
+
+	put_dax(dax_dev);
+	return len;
+}
+static DEVICE_ATTR_RW(write_cache);
+
+static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, typeof(*dev), kobj);
+	struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
+
+	WARN_ON_ONCE(!dax_dev);
+	if (!dax_dev)
+		return 0;
+
+	if (a == &dev_attr_write_cache.attr && !dax_dev->ops->flush)
+		return 0;
+	return a->mode;
+}
+
+static struct attribute *dax_attributes[] = {
+	&dev_attr_write_cache.attr,
+	NULL,
+};
+
+struct attribute_group dax_attribute_group = {
+	.name = "dax",
+	.attrs = dax_attributes,
+	.is_visible = dax_visible,
+};
+EXPORT_SYMBOL_GPL(dax_attribute_group);
+
 /**
  * dax_direct_access() - translate a device pgoff to an absolute pfn
  * @dax_dev: a dax_device instance representing the logical memory range
@@ -172,10 +245,43 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 }
 EXPORT_SYMBOL_GPL(dax_direct_access);
 
+size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+		size_t bytes, struct iov_iter *i)
+{
+	if (!dax_alive(dax_dev))
+		return 0;
+
+	return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
+}
+EXPORT_SYMBOL_GPL(dax_copy_from_iter);
+
+void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+		size_t size)
+{
+	if (!dax_alive(dax_dev))
+		return;
+
+	if (!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))
+		return;
+
+	if (dax_dev->ops->flush)
+		dax_dev->ops->flush(dax_dev, pgoff, addr, size);
+}
+EXPORT_SYMBOL_GPL(dax_flush);
+
+void dax_write_cache(struct dax_device *dax_dev, bool wc)
+{
+	if (wc)
+		set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
+	else
+		clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(dax_write_cache);
+
 bool dax_alive(struct dax_device *dax_dev)
 {
 	lockdep_assert_held(&dax_srcu);
-	return dax_dev->alive;
+	return test_bit(DAXDEV_ALIVE, &dax_dev->flags);
 }
 EXPORT_SYMBOL_GPL(dax_alive);
 
@@ -195,7 +301,7 @@ void kill_dax(struct dax_device *dax_dev)
 	if (!dax_dev)
 		return;
 
-	dax_dev->alive = false;
+	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
 
 	synchronize_srcu(&dax_srcu);
 
@@ -239,7 +345,7 @@ static void dax_destroy_inode(struct inode *inode)
 {
 	struct dax_device *dax_dev = to_dax_dev(inode);
 
-	WARN_ONCE(dax_dev->alive,
+	WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags),
 			"kill_dax() must be called before final iput()\n");
 	call_rcu(&inode->i_rcu, dax_i_callback);
 }
@@ -291,7 +397,7 @@ static struct dax_device *dax_dev_get(dev_t devt)
 
 	dax_dev = to_dax_dev(inode);
 	if (inode->i_state & I_NEW) {
-		dax_dev->alive = true;
+		set_bit(DAXDEV_ALIVE, &dax_dev->flags);
 		inode->i_cdev = &dax_dev->cdev;
 		inode->i_mode = S_IFCHR;
 		inode->i_flags = S_DAX;
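
A sketch of how a block driver might tie the pieces together at
initialization, assuming the existing alloc_dax() entry point and the
hypothetical foo_dax_ops shown earlier ('struct foo_device' is likewise
illustrative):

static int foo_attach_dax(struct foo_device *foo, struct gendisk *disk)
{
	struct dax_device *dax_dev;

	dax_dev = alloc_dax(foo, disk->disk_name, &foo_dax_ops);
	if (!dax_dev)
		return -ENOMEM;
	/* media sits behind the CPU cache, so keep dax_flush() armed */
	dax_write_cache(dax_dev, true);
	foo->dax_dev = dax_dev;
	return 0;
}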

+ 30 - 0
drivers/md/dm-linear.c

@@ -170,6 +170,34 @@ static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 	return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
 }
 
+static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct linear_c *lc = ti->private;
+	struct block_device *bdev = lc->dev->bdev;
+	struct dax_device *dax_dev = lc->dev->dax_dev;
+	sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+	dev_sector = linear_map_sector(ti, sector);
+	if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+		return 0;
+	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
+static void linear_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
+		size_t size)
+{
+	struct linear_c *lc = ti->private;
+	struct block_device *bdev = lc->dev->bdev;
+	struct dax_device *dax_dev = lc->dev->dax_dev;
+	sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+	dev_sector = linear_map_sector(ti, sector);
+	if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff))
+		return;
+	dax_flush(dax_dev, pgoff, addr, size);
+}
+
 static struct target_type linear_target = {
 	.name   = "linear",
 	.version = {1, 4, 0},
@@ -183,6 +211,8 @@ static struct target_type linear_target = {
 	.prepare_ioctl = linear_prepare_ioctl,
 	.iterate_devices = linear_iterate_devices,
 	.direct_access = linear_dax_direct_access,
+	.dax_copy_from_iter = linear_dax_copy_from_iter,
+	.dax_flush = linear_dax_flush,
 };
 
 int __init dm_linear_init(void)

+ 40 - 0
drivers/md/dm-stripe.c

@@ -332,6 +332,44 @@ static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 	return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
 }
 
+static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+	struct stripe_c *sc = ti->private;
+	struct dax_device *dax_dev;
+	struct block_device *bdev;
+	uint32_t stripe;
+
+	stripe_map_sector(sc, sector, &stripe, &dev_sector);
+	dev_sector += sc->stripe[stripe].physical_start;
+	dax_dev = sc->stripe[stripe].dev->dax_dev;
+	bdev = sc->stripe[stripe].dev->bdev;
+
+	if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+		return 0;
+	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
+static void stripe_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
+		size_t size)
+{
+	sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+	struct stripe_c *sc = ti->private;
+	struct dax_device *dax_dev;
+	struct block_device *bdev;
+	uint32_t stripe;
+
+	stripe_map_sector(sc, sector, &stripe, &dev_sector);
+	dev_sector += sc->stripe[stripe].physical_start;
+	dax_dev = sc->stripe[stripe].dev->dax_dev;
+	bdev = sc->stripe[stripe].dev->bdev;
+
+	if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff))
+		return;
+	dax_flush(dax_dev, pgoff, addr, size);
+}
+
 /*
  * Stripe status:
  *
@@ -452,6 +490,8 @@ static struct target_type stripe_target = {
 	.iterate_devices = stripe_iterate_devices,
 	.io_hints = stripe_io_hints,
 	.direct_access = stripe_dax_direct_access,
+	.dax_copy_from_iter = stripe_dax_copy_from_iter,
+	.dax_flush = stripe_dax_flush,
 };
 
 int __init dm_stripe_init(void)

+ 45 - 0
drivers/md/dm.c

@@ -19,6 +19,7 @@
 #include <linux/dax.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/uio.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/wait.h>
@@ -972,6 +973,48 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 	return ret;
 }
 
+static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	sector_t sector = pgoff * PAGE_SECTORS;
+	struct dm_target *ti;
+	long ret = 0;
+	int srcu_idx;
+
+	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+	if (!ti)
+		goto out;
+	if (!ti->type->dax_copy_from_iter) {
+		ret = copy_from_iter(addr, bytes, i);
+		goto out;
+	}
+	ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
+ out:
+	dm_put_live_table(md, srcu_idx);
+
+	return ret;
+}
+
+static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+		size_t size)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	sector_t sector = pgoff * PAGE_SECTORS;
+	struct dm_target *ti;
+	int srcu_idx;
+
+	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+	if (!ti)
+		goto out;
+	if (ti->type->dax_flush)
+		ti->type->dax_flush(ti, pgoff, addr, size);
+ out:
+	dm_put_live_table(md, srcu_idx);
+}
+
 /*
  * A target may call dm_accept_partial_bio only from the map routine.  It is
  * allowed for all bio types except REQ_PREFLUSH.
@@ -2958,6 +3001,8 @@ static const struct block_device_operations dm_blk_dops = {
 
 static const struct dax_operations dm_dax_ops = {
 	.direct_access = dm_dax_direct_access,
+	.copy_from_iter = dm_dax_copy_from_iter,
+	.flush = dm_dax_flush,
 };
 
 /*

+ 30 - 15
drivers/nvdimm/btt.c

@@ -37,8 +37,8 @@ static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
 	struct nd_btt *nd_btt = arena->nd_btt;
 	struct nd_namespace_common *ndns = nd_btt->ndns;
 
-	/* arena offsets are 4K from the base of the device */
-	offset += SZ_4K;
+	/* arena offsets may be shifted from the base of the device */
+	offset += arena->nd_btt->initial_offset;
 	return nvdimm_read_bytes(ndns, offset, buf, n, flags);
 }
 
@@ -48,8 +48,8 @@ static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
 	struct nd_btt *nd_btt = arena->nd_btt;
 	struct nd_namespace_common *ndns = nd_btt->ndns;
 
-	/* arena offsets are 4K from the base of the device */
-	offset += SZ_4K;
+	/* arena offsets may be shifted from the base of the device */
+	offset += arena->nd_btt->initial_offset;
 	return nvdimm_write_bytes(ndns, offset, buf, n, flags);
 }
 
@@ -323,7 +323,7 @@ static int btt_log_read(struct arena_info *arena, u32 lane,
 
 	old_ent = btt_log_get_old(log);
 	if (old_ent < 0 || old_ent > 1) {
-		dev_info(to_dev(arena),
+		dev_err(to_dev(arena),
 				"log corruption (%d): lane %d seq [%d, %d]\n",
 			old_ent, lane, log[0].seq, log[1].seq);
 		/* TODO set error state? */
@@ -576,8 +576,8 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 	arena->internal_lbasize = roundup(arena->external_lbasize,
 					INT_LBASIZE_ALIGNMENT);
 	arena->nfree = BTT_DEFAULT_NFREE;
-	arena->version_major = 1;
-	arena->version_minor = 1;
+	arena->version_major = btt->nd_btt->version_major;
+	arena->version_minor = btt->nd_btt->version_minor;
 
 	if (available % BTT_PG_SIZE)
 		available -= (available % BTT_PG_SIZE);
@@ -684,7 +684,7 @@ static int discover_arenas(struct btt *btt)
 				dev_info(to_dev(arena), "No existing arenas\n");
 				goto out;
 			} else {
-				dev_info(to_dev(arena),
+				dev_err(to_dev(arena),
 						"Found corrupted metadata!\n");
 				ret = -ENODEV;
 				goto out;
@@ -1227,7 +1227,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 		err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
 				  op_is_write(bio_op(bio)), iter.bi_sector);
 		if (err) {
-			dev_info(&btt->nd_btt->dev,
+			dev_err(&btt->nd_btt->dev,
 					"io error in %s sector %lld, len %d,\n",
 					(op_is_write(bio_op(bio))) ? "WRITE" :
 					"READ",
@@ -1248,10 +1248,13 @@ static int btt_rw_page(struct block_device *bdev, sector_t sector,
 		struct page *page, bool is_write)
 {
 	struct btt *btt = bdev->bd_disk->private_data;
+	int rc;
 
-	btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector);
-	page_endio(page, is_write, 0);
-	return 0;
+	rc = btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector);
+	if (rc == 0)
+		page_endio(page, is_write, 0);
+
+	return rc;
 }
 
 
@@ -1369,7 +1372,7 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
 	}
 
 	if (btt->init_state != INIT_READY && nd_region->ro) {
-		dev_info(dev, "%s is read-only, unable to init btt metadata\n",
+		dev_warn(dev, "%s is read-only, unable to init btt metadata\n",
 				dev_name(&nd_region->dev));
 		return NULL;
 	} else if (btt->init_state != INIT_READY) {
@@ -1424,6 +1427,7 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
 {
 	struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
 	struct nd_region *nd_region;
+	struct btt_sb *btt_sb;
 	struct btt *btt;
 	size_t rawsize;
 
@@ -1432,10 +1436,21 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
 		return -ENODEV;
 	}
 
-	rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K;
+	btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
+
+	/*
+	 * If this returns < 0, that is ok as it just means there wasn't
+	 * an existing BTT, and we're creating a new one. We still need to
+	 * call this as we need the version dependent fields in nd_btt to be
+	 * set correctly based on the holder class
+	 */
+	nd_btt_version(nd_btt, ndns, btt_sb);
+
+	rawsize = nvdimm_namespace_capacity(ndns) - nd_btt->initial_offset;
 	if (rawsize < ARENA_MIN_SIZE) {
 		dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n",
-				dev_name(&ndns->dev), ARENA_MIN_SIZE + SZ_4K);
+				dev_name(&ndns->dev),
+				ARENA_MIN_SIZE + nd_btt->initial_offset);
 		return -ENXIO;
 	}
 	nd_region = to_nd_region(nd_btt->dev.parent);

+ 2 - 0
drivers/nvdimm/btt.h

@@ -184,5 +184,7 @@ struct btt {
 };
 
 bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super);
+int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns,
+		struct btt_sb *btt_sb);
 
 #endif

+ 49 - 5
drivers/nvdimm/btt_devs.c

@@ -260,20 +260,55 @@ bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super)
 }
 EXPORT_SYMBOL(nd_btt_arena_is_valid);
 
+int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns,
+		struct btt_sb *btt_sb)
+{
+	if (ndns->claim_class == NVDIMM_CCLASS_BTT2) {
+		/* Probe/setup for BTT v2.0 */
+		nd_btt->initial_offset = 0;
+		nd_btt->version_major = 2;
+		nd_btt->version_minor = 0;
+		if (nvdimm_read_bytes(ndns, 0, btt_sb, sizeof(*btt_sb), 0))
+			return -ENXIO;
+		if (!nd_btt_arena_is_valid(nd_btt, btt_sb))
+			return -ENODEV;
+		if ((le16_to_cpu(btt_sb->version_major) != 2) ||
+				(le16_to_cpu(btt_sb->version_minor) != 0))
+			return -ENODEV;
+	} else {
+		/*
+		 * Probe/setup for BTT v1.1 (NVDIMM_CCLASS_NONE or
+		 * NVDIMM_CCLASS_BTT)
+		 */
+		nd_btt->initial_offset = SZ_4K;
+		nd_btt->version_major = 1;
+		nd_btt->version_minor = 1;
+		if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0))
+			return -ENXIO;
+		if (!nd_btt_arena_is_valid(nd_btt, btt_sb))
+			return -ENODEV;
+		if ((le16_to_cpu(btt_sb->version_major) != 1) ||
+				(le16_to_cpu(btt_sb->version_minor) != 1))
+			return -ENODEV;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(nd_btt_version);
+
 static int __nd_btt_probe(struct nd_btt *nd_btt,
 		struct nd_namespace_common *ndns, struct btt_sb *btt_sb)
 {
+	int rc;
+
 	if (!btt_sb || !ndns || !nd_btt)
 		return -ENODEV;
 
-	if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0))
-		return -ENXIO;
-
 	if (nvdimm_namespace_capacity(ndns) < SZ_16M)
 		return -ENXIO;
 
-	if (!nd_btt_arena_is_valid(nd_btt, btt_sb))
-		return -ENODEV;
+	rc = nd_btt_version(nd_btt, ndns, btt_sb);
+	if (rc < 0)
+		return rc;
 
 	nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize);
 	nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL);
@@ -295,6 +330,15 @@ int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns)
 	if (ndns->force_raw)
 		return -ENODEV;
 
+	switch (ndns->claim_class) {
+	case NVDIMM_CCLASS_NONE:
+	case NVDIMM_CCLASS_BTT:
+	case NVDIMM_CCLASS_BTT2:
+		break;
+	default:
+		return -ENODEV;
+	}
+
 	nvdimm_bus_lock(&ndns->dev);
 	btt_dev = __nd_btt_create(nd_region, 0, NULL, ndns);
 	nvdimm_bus_unlock(&ndns->dev);

+ 10 - 5
drivers/nvdimm/bus.c

@@ -38,13 +38,13 @@ static int to_nd_device_type(struct device *dev)
 {
 	if (is_nvdimm(dev))
 		return ND_DEVICE_DIMM;
-	else if (is_nd_pmem(dev))
+	else if (is_memory(dev))
 		return ND_DEVICE_REGION_PMEM;
 	else if (is_nd_blk(dev))
 		return ND_DEVICE_REGION_BLK;
 	else if (is_nd_dax(dev))
 		return ND_DEVICE_DAX_PMEM;
-	else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent))
+	else if (is_nd_region(dev->parent))
 		return nd_region_to_nstype(to_nd_region(dev->parent));
 
 	return 0;
@@ -56,7 +56,7 @@ static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 	 * Ensure that region devices always have their numa node set as
 	 * early as possible.
 	 */
-	if (is_nd_pmem(dev) || is_nd_blk(dev))
+	if (is_nd_region(dev))
 		set_dev_node(dev, to_nd_region(dev)->numa_node);
 	return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT,
 			to_nd_device_type(dev));
@@ -65,7 +65,7 @@ static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 static struct module *to_bus_provider(struct device *dev)
 {
 	/* pin bus providers while regions are enabled */
-	if (is_nd_pmem(dev) || is_nd_blk(dev)) {
+	if (is_nd_region(dev)) {
 		struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
 
 		return nvdimm_bus->nd_desc->module;
@@ -198,6 +198,9 @@ static int nvdimm_clear_badblocks_region(struct device *dev, void *data)
 	sector = (ctx->phys - nd_region->ndr_start) / 512;
 	badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512);
 
+	if (nd_region->bb_state)
+		sysfs_notify_dirent(nd_region->bb_state);
+
 	return 0;
 }
 
@@ -907,6 +910,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	static char in_env[ND_CMD_MAX_ENVELOPE];
 	const struct nd_cmd_desc *desc = NULL;
 	unsigned int cmd = _IOC_NR(ioctl_cmd);
+	unsigned int func = cmd;
 	void __user *p = (void __user *) arg;
 	struct device *dev = &nvdimm_bus->dev;
 	struct nd_cmd_pkg pkg;
@@ -972,6 +976,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	}
 
 	if (cmd == ND_CMD_CALL) {
+		func = pkg.nd_command;
 		dev_dbg(dev, "%s:%s, idx: %llu, in: %zu, out: %zu, len %zu\n",
 				__func__, dimm_name, pkg.nd_command,
 				in_len, out_len, buf_len);
@@ -1020,7 +1025,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	}
 
 	nvdimm_bus_lock(&nvdimm_bus->dev);
-	rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd, buf);
+	rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf);
 	if (rc)
 		goto out_unlock;
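
The sysfs_notify_dirent() call added in this file is what makes the
region 'badblocks' file poll(2)-capable. A minimal userspace sketch of
the standard sysfs poll pattern follows; the device path is
illustrative and error handling is omitted:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/sys/bus/nd/devices/region0/badblocks", O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };

	if (fd < 0)
		return 1;
	read(fd, buf, sizeof(buf));	/* consume the current contents */
	for (;;) {
		poll(&pfd, 1, -1);	/* woken by sysfs_notify_dirent() */
		lseek(fd, 0, SEEK_SET);
		read(fd, buf, sizeof(buf));	/* re-read updated list */
		printf("badblocks changed\n");
	}
}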
 

+ 33 - 5
drivers/nvdimm/claim.c

@@ -12,8 +12,8 @@
  */
 #include <linux/device.h>
 #include <linux/sizes.h>
-#include <linux/pmem.h>
 #include "nd-core.h"
+#include "pmem.h"
 #include "pfn.h"
 #include "btt.h"
 #include "nd.h"
@@ -184,6 +184,35 @@ ssize_t nd_namespace_store(struct device *dev,
 	}
 
 	ndns = to_ndns(found);
+
+	switch (ndns->claim_class) {
+	case NVDIMM_CCLASS_NONE:
+		break;
+	case NVDIMM_CCLASS_BTT:
+	case NVDIMM_CCLASS_BTT2:
+		if (!is_nd_btt(dev)) {
+			len = -EBUSY;
+			goto out_attach;
+		}
+		break;
+	case NVDIMM_CCLASS_PFN:
+		if (!is_nd_pfn(dev)) {
+			len = -EBUSY;
+			goto out_attach;
+		}
+		break;
+	case NVDIMM_CCLASS_DAX:
+		if (!is_nd_dax(dev)) {
+			len = -EBUSY;
+			goto out_attach;
+		}
+		break;
+	default:
+		len = -EBUSY;
+		goto out_attach;
+		break;
+	}
+
 	if (__nvdimm_namespace_capacity(ndns) < SZ_16M) {
 		dev_dbg(dev, "%s too small to host\n", name);
 		len = -ENXIO;
@@ -260,8 +289,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 		 * work around this collision.
 		 */
 		if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
-				&& !(flags & NVDIMM_IO_ATOMIC)
-				&& !ndns->claim) {
+				&& !(flags & NVDIMM_IO_ATOMIC)) {
 			long cleared;
 
 			cleared = nvdimm_clear_poison(&ndns->dev,
@@ -272,12 +300,12 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 				cleared /= 512;
 				badblocks_clear(&nsio->bb, sector, cleared);
 			}
-			invalidate_pmem(nsio->addr + offset, size);
+			arch_invalidate_pmem(nsio->addr + offset, size);
 		} else
 			rc = -EIO;
 	}
 
-	memcpy_to_pmem(nsio->addr + offset, buf, size);
+	memcpy_flushcache(nsio->addr + offset, buf, size);
 	nvdimm_flush(to_nd_region(ndns->dev.parent));
 
 	return rc;

+ 4 - 1
drivers/nvdimm/core.c

@@ -504,7 +504,7 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region,
 	struct nvdimm_bus *nvdimm_bus;
 	struct list_head *poison_list;
 
-	if (!is_nd_pmem(&nd_region->dev)) {
+	if (!is_memory(&nd_region->dev)) {
 		dev_WARN_ONCE(&nd_region->dev, 1,
 				"%s only valid for pmem regions\n", __func__);
 		return;
@@ -699,6 +699,9 @@ static __init int libnvdimm_init(void)
 	rc = nd_region_init();
 	if (rc)
 		goto err_region;
+
+	nd_label_init();
+
 	return 0;
  err_region:
 	nvdimm_exit();

+ 9 - 1
drivers/nvdimm/dax_devs.c

@@ -89,7 +89,7 @@ struct device *nd_dax_create(struct nd_region *nd_region)
 	struct device *dev = NULL;
 	struct nd_dax *nd_dax;
 
-	if (!is_nd_pmem(&nd_region->dev))
+	if (!is_memory(&nd_region->dev))
 		return NULL;
 
 	nd_dax = nd_dax_alloc(nd_region);
@@ -111,6 +111,14 @@ int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns)
 	if (ndns->force_raw)
 		return -ENODEV;
 
+	switch (ndns->claim_class) {
+	case NVDIMM_CCLASS_NONE:
+	case NVDIMM_CCLASS_DAX:
+		break;
+	default:
+		return -ENODEV;
+	}
+
 	nvdimm_bus_lock(&ndns->dev);
 	nd_dax = nd_dax_alloc(nd_region);
 	nd_pfn = &nd_dax->nd_pfn;

+ 9 - 1
drivers/nvdimm/dimm_devs.c

@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 #include "nd-core.h"
 #include "label.h"
+#include "pmem.h"
 #include "nd.h"
 
 static DEFINE_IDA(dimm_ida);
@@ -235,6 +236,13 @@ struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr)
 }
 EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm);
 
+unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr)
+{
+	/* pmem mapping properties are private to libnvdimm */
+	return ARCH_MEMREMAP_PMEM;
+}
+EXPORT_SYMBOL_GPL(nd_blk_memremap_flags);
+
 struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping)
 {
 	struct nvdimm *nvdimm = nd_mapping->nvdimm;
@@ -411,7 +419,7 @@ int alias_dpa_busy(struct device *dev, void *data)
 	struct resource *res;
 	int i;
 
-	if (!is_nd_pmem(dev))
+	if (!is_memory(dev))
 		return 0;
 
 	nd_region = to_nd_region(dev);

+ 224 - 27
drivers/nvdimm/label.c

@@ -12,6 +12,7 @@
  */
 #include <linux/device.h>
 #include <linux/ndctl.h>
+#include <linux/uuid.h>
 #include <linux/slab.h>
 #include <linux/io.h>
 #include <linux/nd.h>
@@ -19,6 +20,11 @@
 #include "label.h"
 #include "nd.h"
 
+static guid_t nvdimm_btt_guid;
+static guid_t nvdimm_btt2_guid;
+static guid_t nvdimm_pfn_guid;
+static guid_t nvdimm_dax_guid;
+
 static u32 best_seq(u32 a, u32 b)
 {
 	a &= NSINDEX_SEQ_MASK;
@@ -34,6 +40,11 @@ static u32 best_seq(u32 a, u32 b)
 		return a;
 }
 
+unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd)
+{
+	return ndd->nslabel_size;
+}
+
 size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
 {
 	u32 index_span;
@@ -49,7 +60,7 @@ size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
 	 * starts to waste space at larger config_sizes, but it's
 	 * unlikely we'll ever see anything but 128K.
 	 */
-	index_span = ndd->nsarea.config_size / 129;
+	index_span = ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1);
 	index_span /= NSINDEX_ALIGN * 2;
 	ndd->nsindex_size = index_span * NSINDEX_ALIGN;
 
@@ -58,10 +69,10 @@ size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
 
 int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd)
 {
-	return ndd->nsarea.config_size / 129;
+	return ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1);
 }
 
-int nd_label_validate(struct nvdimm_drvdata *ndd)
+static int __nd_label_validate(struct nvdimm_drvdata *ndd)
 {
 	/*
 	 * On media label format consists of two index blocks followed
@@ -104,6 +115,7 @@ int nd_label_validate(struct nvdimm_drvdata *ndd)
 		u32 nslot;
 		u8 sig[NSINDEX_SIG_LEN];
 		u64 sum_save, sum, size;
+		unsigned int version, labelsize;
 
 		memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN);
 		if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) {
@@ -111,6 +123,21 @@ int nd_label_validate(struct nvdimm_drvdata *ndd)
 					__func__, i);
 			continue;
 		}
+
+		/* label sizes larger than 128 arrived with v1.2 */
+		version = __le16_to_cpu(nsindex[i]->major) * 100
+			+ __le16_to_cpu(nsindex[i]->minor);
+		if (version >= 102)
+			labelsize = 1 << (7 + nsindex[i]->labelsize);
+		else
+			labelsize = 128;
+
+		if (labelsize != sizeof_namespace_label(ndd)) {
+			dev_dbg(dev, "%s: nsindex%d labelsize %d invalid\n",
+					__func__, i, nsindex[i]->labelsize);
+			continue;
+		}
+
 		sum_save = __le64_to_cpu(nsindex[i]->checksum);
 		nsindex[i]->checksum = __cpu_to_le64(0);
 		sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1);
@@ -153,7 +180,7 @@ int nd_label_validate(struct nvdimm_drvdata *ndd)
 		}
 
 		nslot = __le32_to_cpu(nsindex[i]->nslot);
-		if (nslot * sizeof(struct nd_namespace_label)
+		if (nslot * sizeof_namespace_label(ndd)
 				+ 2 * sizeof_namespace_index(ndd)
 				> ndd->nsarea.config_size) {
 			dev_dbg(dev, "%s: nsindex%d nslot: %u invalid, config_size: %#x\n",
@@ -189,6 +216,29 @@ int nd_label_validate(struct nvdimm_drvdata *ndd)
 	return -1;
 }
 
+int nd_label_validate(struct nvdimm_drvdata *ndd)
+{
+	/*
+	 * In order to probe for and validate namespace index blocks we
+	 * need to know the size of the labels, and we can't trust the
+	 * size of the labels until we validate the index blocks.
+	 * Resolve this dependency loop by probing for known label
+	 * sizes, but default to v1.2 256-byte namespace labels if
+	 * discovery fails.
+	 */
+	int label_size[] = { 128, 256 };
+	int i, rc;
+
+	for (i = 0; i < ARRAY_SIZE(label_size); i++) {
+		ndd->nslabel_size = label_size[i];
+		rc = __nd_label_validate(ndd);
+		if (rc >= 0)
+			return rc;
+	}
+
+	return -1;
+}
+
 void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst,
 		struct nd_namespace_index *src)
 {
@@ -210,7 +260,22 @@ static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd)
 static int to_slot(struct nvdimm_drvdata *ndd,
 		struct nd_namespace_label *nd_label)
 {
-	return nd_label - nd_label_base(ndd);
+	unsigned long label, base;
+
+	label = (unsigned long) nd_label;
+	base = (unsigned long) nd_label_base(ndd);
+
+	return (label - base) / sizeof_namespace_label(ndd);
+}
+
+static struct nd_namespace_label *to_label(struct nvdimm_drvdata *ndd, int slot)
+{
+	unsigned long label, base;
+
+	base = (unsigned long) nd_label_base(ndd);
+	label = base + sizeof_namespace_label(ndd) * slot;
+
+	return (struct nd_namespace_label *) label;
 }
 
 #define for_each_clear_bit_le(bit, addr, size) \
@@ -268,7 +333,8 @@ static bool preamble_next(struct nvdimm_drvdata *ndd,
 			free, nslot);
 }
 
-static bool slot_valid(struct nd_namespace_label *nd_label, u32 slot)
+static bool slot_valid(struct nvdimm_drvdata *ndd,
+		struct nd_namespace_label *nd_label, u32 slot)
 {
 	/* check that we are written where we expect to be written */
 	if (slot != __le32_to_cpu(nd_label->slot))
@@ -279,6 +345,21 @@ static bool slot_valid(struct nd_namespace_label *nd_label, u32 slot)
 				| __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
 		return false;
 
+	/* check checksum */
+	if (namespace_label_has(ndd, checksum)) {
+		u64 sum, sum_save;
+
+		sum_save = __le64_to_cpu(nd_label->checksum);
+		nd_label->checksum = __cpu_to_le64(0);
+		sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1);
+		nd_label->checksum = __cpu_to_le64(sum_save);
+		if (sum != sum_save) {
+			dev_dbg(ndd->dev, "%s fail checksum. slot: %d expect: %#llx\n",
+				__func__, slot, sum);
+			return false;
+		}
+	}
+
 	return true;
 }
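
Note: slot_valid() applies the usual zero-substitute-restore trick: the label checksum is defined over the label with its checksum field as zero, so verification must clear the field, sum, and put the saved value back. A userspace sketch of the same pattern, with a simplified Fletcher-64 modeled on the kernel's nd_fletcher64() (the real helper also handles endian conversion):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t fletcher64(const void *addr, size_t len)
{
	const uint32_t *buf = addr;
	uint32_t lo32 = 0;
	uint64_t hi32 = 0;
	size_t i;

	for (i = 0; i < len / sizeof(uint32_t); i++) {
		lo32 += buf[i];
		hi32 += lo32;
	}
	return hi32 << 32 | lo32;
}

struct demo_label {
	char payload[248];
	uint64_t checksum;	/* covered by the sum, as zero */
};

int main(void)
{
	struct demo_label l = { .payload = "example" };
	uint64_t sum, saved;

	l.checksum = 0;
	l.checksum = fletcher64(&l, sizeof(l));	/* seal */

	saved = l.checksum;			/* verify */
	l.checksum = 0;
	sum = fletcher64(&l, sizeof(l));
	l.checksum = saved;
	printf("checksum %s\n", sum == saved ? "ok" : "bad");
	return 0;
}
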
 
@@ -299,9 +380,9 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
 		struct resource *res;
 		u32 flags;
 
-		nd_label = nd_label_base(ndd) + slot;
+		nd_label = to_label(ndd, slot);
 
-		if (!slot_valid(nd_label, slot))
+		if (!slot_valid(ndd, nd_label, slot))
 			continue;
 
 		memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
@@ -331,9 +412,9 @@ int nd_label_active_count(struct nvdimm_drvdata *ndd)
 	for_each_clear_bit_le(slot, free, nslot) {
 		struct nd_namespace_label *nd_label;
 
-		nd_label = nd_label_base(ndd) + slot;
+		nd_label = to_label(ndd, slot);
 
-		if (!slot_valid(nd_label, slot)) {
+		if (!slot_valid(ndd, nd_label, slot)) {
 			u32 label_slot = __le32_to_cpu(nd_label->slot);
 			u64 size = __le64_to_cpu(nd_label->rawsize);
 			u64 dpa = __le64_to_cpu(nd_label->dpa);
@@ -360,12 +441,12 @@ struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n)
 	for_each_clear_bit_le(slot, free, nslot) {
 		struct nd_namespace_label *nd_label;
 
-		nd_label = nd_label_base(ndd) + slot;
-		if (!slot_valid(nd_label, slot))
+		nd_label = to_label(ndd, slot);
+		if (!slot_valid(ndd, nd_label, slot))
 			continue;
 
 		if (n-- == 0)
-			return nd_label_base(ndd) + slot;
+			return to_label(ndd, slot);
 	}
 
 	return NULL;
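
Note: the to_label()/to_slot() conversions used above exist because struct nd_namespace_label now describes the full 256-byte v1.2 layout, so struct-pointer arithmetic would stride 256 bytes even when the media holds 128-byte v1.1 labels. A standalone illustration of slot math with an explicit runtime stride (the helper names are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>

static void *slot_to_label(void *base, unsigned int slot, size_t stride)
{
	return (char *) base + (size_t) slot * stride;
}

static unsigned int label_to_slot(void *base, void *label, size_t stride)
{
	return ((uintptr_t) label - (uintptr_t) base) / stride;
}

int main(void)
{
	char area[4096];	/* stand-in for the on-media label storage */
	void *l = slot_to_label(area, 3, 128);

	/* slot 3 with a 128-byte stride lands at offset 384, not 768 */
	printf("slot %u at offset %td\n",
			label_to_slot(area, l, 128), (char *) l - area);
	return 0;
}
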
@@ -437,7 +518,8 @@ static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq,
 		nslot = __le32_to_cpu(nsindex->nslot);
 
 	memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN);
-	nsindex->flags = __cpu_to_le32(0);
+	memset(&nsindex->flags, 0, 3);
+	nsindex->labelsize = sizeof_namespace_label(ndd) >> 8;
 	nsindex->seq = __cpu_to_le32(seq);
 	offset = (unsigned long) nsindex
 		- (unsigned long) to_namespace_index(ndd, 0);
@@ -452,7 +534,10 @@ static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq,
 	nsindex->labeloff = __cpu_to_le64(offset);
 	nsindex->nslot = __cpu_to_le32(nslot);
 	nsindex->major = __cpu_to_le16(1);
-	nsindex->minor = __cpu_to_le16(1);
+	if (sizeof_namespace_label(ndd) < 256)
+		nsindex->minor = __cpu_to_le16(1);
+	else
+		nsindex->minor = __cpu_to_le16(2);
 	nsindex->checksum = __cpu_to_le64(0);
 	if (flags & ND_NSINDEX_INIT) {
 		unsigned long *free = (unsigned long *) nsindex->free;
@@ -490,11 +575,49 @@ static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd,
 		- (unsigned long) to_namespace_index(ndd, 0);
 }
 
+enum nvdimm_claim_class to_nvdimm_cclass(guid_t *guid)
+{
+	if (guid_equal(guid, &nvdimm_btt_guid))
+		return NVDIMM_CCLASS_BTT;
+	else if (guid_equal(guid, &nvdimm_btt2_guid))
+		return NVDIMM_CCLASS_BTT2;
+	else if (guid_equal(guid, &nvdimm_pfn_guid))
+		return NVDIMM_CCLASS_PFN;
+	else if (guid_equal(guid, &nvdimm_dax_guid))
+		return NVDIMM_CCLASS_DAX;
+	else if (guid_equal(guid, &guid_null))
+		return NVDIMM_CCLASS_NONE;
+
+	return NVDIMM_CCLASS_UNKNOWN;
+}
+
+static const guid_t *to_abstraction_guid(enum nvdimm_claim_class claim_class,
+	guid_t *target)
+{
+	if (claim_class == NVDIMM_CCLASS_BTT)
+		return &nvdimm_btt_guid;
+	else if (claim_class == NVDIMM_CCLASS_BTT2)
+		return &nvdimm_btt2_guid;
+	else if (claim_class == NVDIMM_CCLASS_PFN)
+		return &nvdimm_pfn_guid;
+	else if (claim_class == NVDIMM_CCLASS_DAX)
+		return &nvdimm_dax_guid;
+	else if (claim_class == NVDIMM_CCLASS_UNKNOWN) {
+		/*
+		 * If we're modifying a namespace for which we don't
+		 * know the claim_class, don't touch the existing guid.
+		 */
+		return target;
+	} else
+		return &guid_null;
+}
+
 static int __pmem_label_update(struct nd_region *nd_region,
 		struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm,
 		int pos)
 {
-	u64 cookie = nd_region_interleave_set_cookie(nd_region);
+	struct nd_namespace_common *ndns = &nspm->nsio.common;
+	struct nd_interleave_set *nd_set = nd_region->nd_set;
 	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
 	struct nd_label_ent *label_ent, *victim = NULL;
 	struct nd_namespace_label *nd_label;
@@ -504,11 +627,13 @@ static int __pmem_label_update(struct nd_region *nd_region,
 	unsigned long *free;
 	u32 nslot, slot;
 	size_t offset;
+	u64 cookie;
 	int rc;
 
 	if (!preamble_next(ndd, &nsindex, &free, &nslot))
 		return -ENXIO;
 
+	cookie = nd_region_interleave_set_cookie(nd_region, nsindex);
 	nd_label_gen_id(&label_id, nspm->uuid, 0);
 	for_each_dpa_resource(ndd, res)
 		if (strcmp(res->name, label_id.id) == 0)
@@ -525,8 +650,8 @@ static int __pmem_label_update(struct nd_region *nd_region,
 		return -ENXIO;
 	dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot);
 
-	nd_label = nd_label_base(ndd) + slot;
-	memset(nd_label, 0, sizeof(struct nd_namespace_label));
+	nd_label = to_label(ndd, slot);
+	memset(nd_label, 0, sizeof_namespace_label(ndd));
 	memcpy(nd_label->uuid, nspm->uuid, NSLABEL_UUID_LEN);
 	if (nspm->alt_name)
 		memcpy(nd_label->name, nspm->alt_name, NSLABEL_NAME_LEN);
@@ -535,14 +660,28 @@ static int __pmem_label_update(struct nd_region *nd_region,
 	nd_label->position = __cpu_to_le16(pos);
 	nd_label->isetcookie = __cpu_to_le64(cookie);
 	nd_label->rawsize = __cpu_to_le64(resource_size(res));
+	nd_label->lbasize = __cpu_to_le64(nspm->lbasize);
 	nd_label->dpa = __cpu_to_le64(res->start);
 	nd_label->slot = __cpu_to_le32(slot);
+	if (namespace_label_has(ndd, type_guid))
+		guid_copy(&nd_label->type_guid, &nd_set->type_guid);
+	if (namespace_label_has(ndd, abstraction_guid))
+		guid_copy(&nd_label->abstraction_guid,
+				to_abstraction_guid(ndns->claim_class,
+					&nd_label->abstraction_guid));
+	if (namespace_label_has(ndd, checksum)) {
+		u64 sum;
+
+		nd_label->checksum = __cpu_to_le64(0);
+		sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1);
+		nd_label->checksum = __cpu_to_le64(sum);
+	}
 	nd_dbg_dpa(nd_region, ndd, res, "%s\n", __func__);
 
 	/* update label */
 	offset = nd_label_offset(ndd, nd_label);
 	rc = nvdimm_set_config_data(ndd, offset, nd_label,
-			sizeof(struct nd_namespace_label));
+			sizeof_namespace_label(ndd));
 	if (rc < 0)
 		return rc;
 
@@ -624,6 +763,8 @@ static int __blk_label_update(struct nd_region *nd_region,
 		int num_labels)
 {
 	int i, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO;
+	struct nd_interleave_set *nd_set = nd_region->nd_set;
+	struct nd_namespace_common *ndns = &nsblk->common;
 	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
 	struct nd_namespace_label *nd_label;
 	struct nd_label_ent *label_ent, *e;
@@ -632,6 +773,7 @@ static int __blk_label_update(struct nd_region *nd_region,
 	struct resource *res, **old_res_list;
 	struct nd_label_id label_id;
 	u8 uuid[NSLABEL_UUID_LEN];
+	int min_dpa_idx = 0;
 	LIST_HEAD(list);
 	u32 nslot, slot;
 
@@ -668,7 +810,7 @@ static int __blk_label_update(struct nd_region *nd_region,
 
 		/* mark unused labels for garbage collection */
 		for_each_clear_bit_le(slot, free, nslot) {
-			nd_label = nd_label_base(ndd) + slot;
+			nd_label = to_label(ndd, slot);
 			memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
 			if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
 				continue;
@@ -703,6 +845,18 @@ static int __blk_label_update(struct nd_region *nd_region,
 		}
 	}
 
+	/*
+	 * Find the resource associated with the first label in the set
+	 * per the v1.2 namespace specification.
+	 */
+	for (i = 0; i < nsblk->num_resources; i++) {
+		struct resource *min = nsblk->res[min_dpa_idx];
+
+		res = nsblk->res[i];
+		if (res->start < min->start)
+			min_dpa_idx = i;
+	}
+
 	for (i = 0; i < nsblk->num_resources; i++) {
 		size_t offset;
 
@@ -714,25 +868,58 @@ static int __blk_label_update(struct nd_region *nd_region,
 			goto abort;
 		dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot);
 
-		nd_label = nd_label_base(ndd) + slot;
-		memset(nd_label, 0, sizeof(struct nd_namespace_label));
+		nd_label = to_label(ndd, slot);
+		memset(nd_label, 0, sizeof_namespace_label(ndd));
 		memcpy(nd_label->uuid, nsblk->uuid, NSLABEL_UUID_LEN);
 		if (nsblk->alt_name)
 			memcpy(nd_label->name, nsblk->alt_name,
 					NSLABEL_NAME_LEN);
 		nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_LOCAL);
-		nd_label->nlabel = __cpu_to_le16(0); /* N/A */
-		nd_label->position = __cpu_to_le16(0); /* N/A */
-		nd_label->isetcookie = __cpu_to_le64(0); /* N/A */
+
+		/*
+		 * Use the presence of the type_guid as a flag to
+		 * determine isetcookie usage and nlabel + position
+		 * policy for blk-aperture namespaces.
+		 */
+		if (namespace_label_has(ndd, type_guid)) {
+			if (i == min_dpa_idx) {
+				nd_label->nlabel = __cpu_to_le16(nsblk->num_resources);
+				nd_label->position = __cpu_to_le16(0);
+			} else {
+				nd_label->nlabel = __cpu_to_le16(0xffff);
+				nd_label->position = __cpu_to_le16(0xffff);
+			}
+			nd_label->isetcookie = __cpu_to_le64(nd_set->cookie2);
+		} else {
+			nd_label->nlabel = __cpu_to_le16(0); /* N/A */
+			nd_label->position = __cpu_to_le16(0); /* N/A */
+			nd_label->isetcookie = __cpu_to_le64(0); /* N/A */
+		}
+
 		nd_label->dpa = __cpu_to_le64(res->start);
 		nd_label->rawsize = __cpu_to_le64(resource_size(res));
 		nd_label->lbasize = __cpu_to_le64(nsblk->lbasize);
 		nd_label->slot = __cpu_to_le32(slot);
+		if (namespace_label_has(ndd, type_guid))
+			guid_copy(&nd_label->type_guid, &nd_set->type_guid);
+		if (namespace_label_has(ndd, abstraction_guid))
+			guid_copy(&nd_label->abstraction_guid,
+					to_abstraction_guid(ndns->claim_class,
+						&nd_label->abstraction_guid));
+
+		if (namespace_label_has(ndd, checksum)) {
+			u64 sum;
+
+			nd_label->checksum = __cpu_to_le64(0);
+			sum = nd_fletcher64(nd_label,
+					sizeof_namespace_label(ndd), 1);
+			nd_label->checksum = __cpu_to_le64(sum);
+		}
 
 		/* update label */
 		offset = nd_label_offset(ndd, nd_label);
 		rc = nvdimm_set_config_data(ndd, offset, nd_label,
-				sizeof(struct nd_namespace_label));
+				sizeof_namespace_label(ndd));
 		if (rc < 0)
 			goto abort;
 	}
@@ -790,7 +977,7 @@ static int __blk_label_update(struct nd_region *nd_region,
 		goto out;
 	}
 	for_each_clear_bit_le(slot, free, nslot) {
-		nd_label = nd_label_base(ndd) + slot;
+		nd_label = to_label(ndd, slot);
 		memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
 		if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
 			continue;
@@ -973,3 +1160,13 @@ int nd_blk_namespace_label_update(struct nd_region *nd_region,
 
 	return __blk_label_update(nd_region, nd_mapping, nsblk, count);
 }
+
+int __init nd_label_init(void)
+{
+	WARN_ON(guid_parse(NVDIMM_BTT_GUID, &nvdimm_btt_guid));
+	WARN_ON(guid_parse(NVDIMM_BTT2_GUID, &nvdimm_btt2_guid));
+	WARN_ON(guid_parse(NVDIMM_PFN_GUID, &nvdimm_pfn_guid));
+	WARN_ON(guid_parse(NVDIMM_DAX_GUID, &nvdimm_dax_guid));
+
+	return 0;
+}

+ 19 - 2
drivers/nvdimm/label.h

@@ -15,6 +15,7 @@
 
 #include <linux/ndctl.h>
 #include <linux/sizes.h>
+#include <linux/uuid.h>
 #include <linux/io.h>
 
 enum {
@@ -60,7 +61,8 @@ static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0";
  */
 struct nd_namespace_index {
 	u8 sig[NSINDEX_SIG_LEN];
-	__le32 flags;
+	u8 flags[3];
+	u8 labelsize;
 	__le32 seq;
 	__le64 myoff;
 	__le64 mysize;
@@ -98,9 +100,23 @@ struct nd_namespace_label {
 	__le64 dpa;
 	__le64 rawsize;
 	__le32 slot;
-	__le32 unused;
+	/*
+	 * Accessing fields past this point should be gated by a
+	 * namespace_label_has() check.
+	 */
+	u8 align;
+	u8 reserved[3];
+	guid_t type_guid;
+	guid_t abstraction_guid;
+	u8 reserved2[88];
+	__le64 checksum;
 };
 
+#define NVDIMM_BTT_GUID "8aed63a2-29a2-4c66-8b12-f05d15d3922a"
+#define NVDIMM_BTT2_GUID "18633bfc-1735-4217-8ac9-17239282d3f8"
+#define NVDIMM_PFN_GUID "266400ba-fb9f-4677-bcb0-968f11d0d225"
+#define NVDIMM_DAX_GUID "97a86d9c-3cdd-4eda-986f-5068b4f80088"
+
 /**
  * struct nd_label_id - identifier string for dpa allocation
  * @id: "{blk|pmem}-<namespace uuid>"
@@ -131,6 +147,7 @@ struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n);
 u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd);
 bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot);
 u32 nd_label_nfree(struct nvdimm_drvdata *ndd);
+enum nvdimm_claim_class to_nvdimm_cclass(guid_t *guid);
 struct nd_region;
 struct nd_namespace_pmem;
 struct nd_namespace_blk;
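
Note: all of the new v1.2 fields (type_guid, abstraction_guid, checksum) sit past the original 128-byte layout, which is what makes the namespace_label_has() macro added to nd.h (shown later in this diff) work: a field "exists" only if its offset falls inside the label size discovered on media. A standalone demonstration of that offsetof() test, using an illustrative stand-in layout rather than the real struct:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* illustrative: v1.1 content packed into the first 128 bytes,
 * v1.2 additions following, for a 256-byte total */
struct demo_label {
	uint8_t v11_fields[128];
	uint8_t type_guid[16];
	uint8_t abstraction_guid[16];
	uint8_t reserved2[88];
	uint64_t checksum;
};

#define label_has(labelsize, field) \
	(offsetof(struct demo_label, field) < (labelsize))

int main(void)
{
	printf("v1.1 (128): has checksum? %d\n", label_has(128, checksum));
	printf("v1.2 (256): has checksum? %d\n", label_has(256, checksum));
	return 0;
}
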

+ 241 - 41
drivers/nvdimm/namespace_devs.c

@@ -14,10 +14,10 @@
 #include <linux/device.h>
 #include <linux/sort.h>
 #include <linux/slab.h>
-#include <linux/pmem.h>
 #include <linux/list.h>
 #include <linux/nd.h>
 #include "nd-core.h"
+#include "pmem.h"
 #include "nd.h"
 
 static void namespace_io_release(struct device *dev)
@@ -112,7 +112,7 @@ static int is_uuid_busy(struct device *dev, void *data)
 
 static int is_namespace_uuid_busy(struct device *dev, void *data)
 {
-	if (is_nd_pmem(dev) || is_nd_blk(dev))
+	if (is_nd_region(dev))
 		return device_for_each_child(dev, data, is_uuid_busy);
 	return 0;
 }
@@ -155,14 +155,33 @@ bool pmem_should_map_pages(struct device *dev)
 				IORES_DESC_NONE) == REGION_MIXED)
 		return false;
 
-#ifdef ARCH_MEMREMAP_PMEM
 	return ARCH_MEMREMAP_PMEM == MEMREMAP_WB;
-#else
-	return false;
-#endif
 }
 EXPORT_SYMBOL(pmem_should_map_pages);
 
+unsigned int pmem_sector_size(struct nd_namespace_common *ndns)
+{
+	if (is_namespace_pmem(&ndns->dev)) {
+		struct nd_namespace_pmem *nspm;
+
+		nspm = to_nd_namespace_pmem(&ndns->dev);
+		if (nspm->lbasize == 0 || nspm->lbasize == 512)
+			/* default */;
+		else if (nspm->lbasize == 4096)
+			return 4096;
+		else
+			dev_WARN(&ndns->dev, "unsupported sector size: %ld\n",
+					nspm->lbasize);
+	}
+
+	/*
+	 * There is no namespace label (is_namespace_io()), or the label
+	 * indicates the default sector size.
+	 */
+	return 512;
+}
+EXPORT_SYMBOL(pmem_sector_size);
+
 const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 		char *name)
 {
@@ -787,7 +806,7 @@ static int __reserve_free_pmem(struct device *dev, void *data)
 	struct nd_label_id label_id;
 	int i;
 
-	if (!is_nd_pmem(dev))
+	if (!is_memory(dev))
 		return 0;
 
 	nd_region = to_nd_region(dev);
@@ -1283,28 +1302,49 @@ static ssize_t resource_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(resource);
 
-static const unsigned long ns_lbasize_supported[] = { 512, 520, 528,
+static const unsigned long blk_lbasize_supported[] = { 512, 520, 528,
 	4096, 4104, 4160, 4224, 0 };
 
+static const unsigned long pmem_lbasize_supported[] = { 512, 4096, 0 };
+
 static ssize_t sector_size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+	if (is_namespace_blk(dev)) {
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
 
-	if (!is_namespace_blk(dev))
-		return -ENXIO;
+		return nd_sector_size_show(nsblk->lbasize,
+				blk_lbasize_supported, buf);
+	}
+
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
-	return nd_sector_size_show(nsblk->lbasize, ns_lbasize_supported, buf);
+		return nd_sector_size_show(nspm->lbasize,
+				pmem_lbasize_supported, buf);
+	}
+	return -ENXIO;
 }
 
 static ssize_t sector_size_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
-	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
 	struct nd_region *nd_region = to_nd_region(dev->parent);
+	const unsigned long *supported;
+	unsigned long *lbasize;
 	ssize_t rc = 0;
 
-	if (!is_namespace_blk(dev))
+	if (is_namespace_blk(dev)) {
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+		lbasize = &nsblk->lbasize;
+		supported = blk_lbasize_supported;
+	} else if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		lbasize = &nspm->lbasize;
+		supported = pmem_lbasize_supported;
+	} else
 		return -ENXIO;
 
 	device_lock(dev);
@@ -1312,8 +1352,7 @@ static ssize_t sector_size_store(struct device *dev,
 	if (to_ndns(dev)->claim)
 		rc = -EBUSY;
 	if (rc >= 0)
-		rc = nd_sector_size_store(dev, buf, &nsblk->lbasize,
-				ns_lbasize_supported);
+		rc = nd_sector_size_store(dev, buf, lbasize, supported);
 	if (rc >= 0)
 		rc = nd_namespace_label_update(nd_region, dev);
 	dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__,
@@ -1368,6 +1407,58 @@ static ssize_t dpa_extents_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(dpa_extents);
 
+static int btt_claim_class(struct device *dev)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	int i, loop_bitmask = 0;
+
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+		struct nd_namespace_index *nsindex;
+
+		nsindex = to_namespace_index(ndd, ndd->ns_current);
+		if (nsindex == NULL)
+			loop_bitmask |= 1;
+		else {
+			/* check whether existing labels are v1.1 or v1.2 */
+			if (__le16_to_cpu(nsindex->major) == 1
+					&& __le16_to_cpu(nsindex->minor) == 1)
+				loop_bitmask |= 2;
+			else
+				loop_bitmask |= 4;
+		}
+	}
+	/*
+	 * If nsindex is null loop_bitmask's bit 0 will be set, and if an index
+	 * block is found, a v1.1 label for any mapping will set bit 1, and a
+	 * v1.2 label will set bit 2.
+	 *
+	 * At the end of the loop, at most one of the three bits must be set.
+	 * If multiple bits were set, it means the different mappings disagree
+	 * about their labels, and this must be cleaned up first.
+	 *
+	 * If all the label index blocks are found to agree, nsindex of NULL
+	 * implies labels haven't been initialized yet, and when they will,
+	 * they will be of the 1.2 format, so we can assume BTT2.0
+	 *
+	 * If 1.1 labels are found, we enforce BTT1.1, and if 1.2 labels are
+	 * found, we enforce BTT2.0
+	 *
+	 * If the loop was never entered, default to BTT1.1 (legacy namespaces)
+	 */
+	switch (loop_bitmask) {
+	case 0:
+	case 2:
+		return NVDIMM_CCLASS_BTT;
+	case 1:
+	case 4:
+		return NVDIMM_CCLASS_BTT2;
+	default:
+		return -ENXIO;
+	}
+}
+
 static ssize_t holder_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -1382,6 +1473,74 @@ static ssize_t holder_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(holder);
 
+static ssize_t __holder_class_store(struct device *dev, const char *buf)
+{
+	struct nd_namespace_common *ndns = to_ndns(dev);
+
+	if (dev->driver || ndns->claim)
+		return -EBUSY;
+
+	if (strcmp(buf, "btt") == 0 || strcmp(buf, "btt\n") == 0)
+		ndns->claim_class = btt_claim_class(dev);
+	else if (strcmp(buf, "pfn") == 0 || strcmp(buf, "pfn\n") == 0)
+		ndns->claim_class = NVDIMM_CCLASS_PFN;
+	else if (strcmp(buf, "dax") == 0 || strcmp(buf, "dax\n") == 0)
+		ndns->claim_class = NVDIMM_CCLASS_DAX;
+	else if (strcmp(buf, "") == 0 || strcmp(buf, "\n") == 0)
+		ndns->claim_class = NVDIMM_CCLASS_NONE;
+	else
+		return -EINVAL;
+
+	/* btt_claim_class() could've returned an error */
+	if (ndns->claim_class < 0)
+		return ndns->claim_class;
+
+	return 0;
+}
+
+static ssize_t holder_class_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	ssize_t rc;
+
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
+	wait_nvdimm_bus_probe_idle(dev);
+	rc = __holder_class_store(dev, buf);
+	if (rc >= 0)
+		rc = nd_namespace_label_update(nd_region, dev);
+	dev_dbg(dev, "%s: %s(%zd)\n", __func__, rc < 0 ? "fail " : "", rc);
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
+
+	return rc < 0 ? rc : len;
+}
+
+static ssize_t holder_class_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_namespace_common *ndns = to_ndns(dev);
+	ssize_t rc;
+
+	device_lock(dev);
+	if (ndns->claim_class == NVDIMM_CCLASS_NONE)
+		rc = sprintf(buf, "\n");
+	else if ((ndns->claim_class == NVDIMM_CCLASS_BTT) ||
+			(ndns->claim_class == NVDIMM_CCLASS_BTT2))
+		rc = sprintf(buf, "btt\n");
+	else if (ndns->claim_class == NVDIMM_CCLASS_PFN)
+		rc = sprintf(buf, "pfn\n");
+	else if (ndns->claim_class == NVDIMM_CCLASS_DAX)
+		rc = sprintf(buf, "dax\n");
+	else
+		rc = sprintf(buf, "<unknown>\n");
+	device_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RW(holder_class);
+
 static ssize_t mode_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -1440,6 +1599,7 @@ static struct attribute *nd_namespace_attributes[] = {
 	&dev_attr_force_raw.attr,
 	&dev_attr_sector_size.attr,
 	&dev_attr_dpa_extents.attr,
+	&dev_attr_holder_class.attr,
 	NULL,
 };
 
@@ -1458,14 +1618,12 @@ static umode_t namespace_visible(struct kobject *kobj,
 		if (a == &dev_attr_size.attr)
 			return 0644;
 
-		if (is_namespace_pmem(dev) && a == &dev_attr_sector_size.attr)
-			return 0;
-
 		return a->mode;
 	}
 
 	if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr
 			|| a == &dev_attr_holder.attr
+			|| a == &dev_attr_holder_class.attr
 			|| a == &dev_attr_force_raw.attr
 			|| a == &dev_attr_mode.attr)
 		return a->mode;
@@ -1599,6 +1757,8 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid,
 
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nd_interleave_set *nd_set = nd_region->nd_set;
+		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
 		struct nd_label_ent *label_ent;
 		bool found_uuid = false;
 
@@ -1619,8 +1779,17 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid,
 			if (memcmp(nd_label->uuid, uuid, NSLABEL_UUID_LEN) != 0)
 				continue;
 
+			if (namespace_label_has(ndd, type_guid)
+					&& !guid_equal(&nd_set->type_guid,
+						&nd_label->type_guid)) {
+				dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n",
+						nd_set->type_guid.b,
+						nd_label->type_guid.b);
+				continue;
+			}
+
 			if (found_uuid) {
-				dev_dbg(to_ndd(nd_mapping)->dev,
+				dev_dbg(ndd->dev,
 						"%s duplicate entry for uuid\n",
 						__func__);
 				return false;
@@ -1698,10 +1867,11 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
  * @nd_label: target pmem namespace label to evaluate
  */
 struct device *create_namespace_pmem(struct nd_region *nd_region,
+		struct nd_namespace_index *nsindex,
 		struct nd_namespace_label *nd_label)
 {
+	u64 cookie = nd_region_interleave_set_cookie(nd_region, nsindex);
 	u64 altcookie = nd_region_interleave_set_altcookie(nd_region);
-	u64 cookie = nd_region_interleave_set_cookie(nd_region);
 	struct nd_label_ent *label_ent;
 	struct nd_namespace_pmem *nspm;
 	struct nd_mapping *nd_mapping;
@@ -1775,6 +1945,7 @@ struct device *create_namespace_pmem(struct nd_region *nd_region,
 	/* Calculate total size and populate namespace properties from label0 */
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
 		struct nd_namespace_label *label0;
+		struct nvdimm_drvdata *ndd;
 
 		nd_mapping = &nd_region->mapping[i];
 		label_ent = list_first_entry_or_null(&nd_mapping->labels,
@@ -1794,6 +1965,12 @@ struct device *create_namespace_pmem(struct nd_region *nd_region,
 				NSLABEL_NAME_LEN, GFP_KERNEL);
 		nspm->uuid = kmemdup((void __force *) label0->uuid,
 				NSLABEL_UUID_LEN, GFP_KERNEL);
+		nspm->lbasize = __le64_to_cpu(label0->lbasize);
+		ndd = to_ndd(nd_mapping);
+		if (namespace_label_has(ndd, abstraction_guid))
+			nspm->nsio.common.claim_class
+				= to_nvdimm_cclass(&label0->abstraction_guid);
+
 	}
 
 	if (!nspm->alt_name || !nspm->uuid) {
@@ -1876,7 +2053,7 @@ static struct device *nd_namespace_pmem_create(struct nd_region *nd_region)
 	struct resource *res;
 	struct device *dev;
 
-	if (!is_nd_pmem(&nd_region->dev))
+	if (!is_memory(&nd_region->dev))
 		return NULL;
 
 	nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
@@ -2005,12 +2182,29 @@ struct device *create_namespace_blk(struct nd_region *nd_region,
 {
 
 	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+	struct nd_interleave_set *nd_set = nd_region->nd_set;
 	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
 	struct nd_namespace_blk *nsblk;
 	char name[NSLABEL_NAME_LEN];
 	struct device *dev = NULL;
 	struct resource *res;
 
+	if (namespace_label_has(ndd, type_guid)) {
+		if (!guid_equal(&nd_set->type_guid, &nd_label->type_guid)) {
+			dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n",
+					nd_set->type_guid.b,
+					nd_label->type_guid.b);
+			return ERR_PTR(-EAGAIN);
+		}
+
+		if (nd_label->isetcookie != __cpu_to_le64(nd_set->cookie2)) {
+			dev_dbg(ndd->dev, "expect cookie %#llx got %#llx\n",
+					nd_set->cookie2,
+					__le64_to_cpu(nd_label->isetcookie));
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
 	nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
 	if (!nsblk)
 		return ERR_PTR(-ENOMEM);
@@ -2021,6 +2215,9 @@ struct device *create_namespace_blk(struct nd_region *nd_region,
 	nsblk->lbasize = __le64_to_cpu(nd_label->lbasize);
 	nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN,
 			GFP_KERNEL);
+	if (namespace_label_has(ndd, abstraction_guid))
+		nsblk->common.claim_class
+			= to_nvdimm_cclass(&nd_label->abstraction_guid);
 	if (!nsblk->uuid)
 		goto blk_err;
 	memcpy(name, nd_label->name, NSLABEL_NAME_LEN);
@@ -2102,27 +2299,30 @@ static struct device **scan_labels(struct nd_region *nd_region)
 		kfree(devs);
 		devs = __devs;
 
-		if (is_nd_blk(&nd_region->dev)) {
+		if (is_nd_blk(&nd_region->dev))
 			dev = create_namespace_blk(nd_region, nd_label, count);
-			if (IS_ERR(dev))
+		else {
+			struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+			struct nd_namespace_index *nsindex;
+
+			nsindex = to_namespace_index(ndd, ndd->ns_current);
+			dev = create_namespace_pmem(nd_region, nsindex, nd_label);
+		}
+
+		if (IS_ERR(dev)) {
+			switch (PTR_ERR(dev)) {
+			case -EAGAIN:
+				/* skip invalid labels */
+				continue;
+			case -ENODEV:
+				/* fallthrough to seed creation */
+				break;
+			default:
 				goto err;
+			}
+		} else
 			devs[count++] = dev;
-		} else {
-			dev = create_namespace_pmem(nd_region, nd_label);
-			if (IS_ERR(dev)) {
-				switch (PTR_ERR(dev)) {
-				case -EAGAIN:
-					/* skip invalid labels */
-					continue;
-				case -ENODEV:
-					/* fallthrough to seed creation */
-					break;
-				default:
-					goto err;
-				}
-			} else
-				devs[count++] = dev;
-		}
+
 	}
 
 	dev_dbg(&nd_region->dev, "%s: discovered %d %s namespace%s\n",
@@ -2156,7 +2356,7 @@ static struct device **scan_labels(struct nd_region *nd_region)
 		}
 		dev->parent = &nd_region->dev;
 		devs[count++] = dev;
-	} else if (is_nd_pmem(&nd_region->dev)) {
+	} else if (is_memory(&nd_region->dev)) {
 		/* clean unselected labels */
 		for (i = 0; i < nd_region->ndr_mappings; i++) {
 			struct list_head *l, *e;

+ 9 - 0
drivers/nvdimm/nd-core.h

@@ -64,7 +64,16 @@ struct blk_alloc_info {
 
 bool is_nvdimm(struct device *dev);
 bool is_nd_pmem(struct device *dev);
+bool is_nd_volatile(struct device *dev);
 bool is_nd_blk(struct device *dev);
+static inline bool is_nd_region(struct device *dev)
+{
+	return is_nd_pmem(dev) || is_nd_blk(dev) || is_nd_volatile(dev);
+}
+static inline bool is_memory(struct device *dev)
+{
+	return is_nd_pmem(dev) || is_nd_volatile(dev);
+}
 struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);

+ 15 - 2
drivers/nvdimm/nd.h

@@ -42,7 +42,7 @@ struct nd_poison {
 
 struct nvdimm_drvdata {
 	struct device *dev;
-	int nsindex_size;
+	int nsindex_size, nslabel_size;
 	struct nd_cmd_get_config_size nsarea;
 	void *data;
 	int ns_current, ns_next;
@@ -96,6 +96,12 @@ static inline struct nd_namespace_index *to_next_namespace_index(
 	return to_namespace_index(ndd, ndd->ns_next);
 }
 
+unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd);
+
+#define namespace_label_has(ndd, field) \
+	(offsetof(struct nd_namespace_label, field) \
+		< sizeof_namespace_label(ndd))
+
 #define nd_dbg_dpa(r, d, res, fmt, arg...) \
 	dev_dbg((r) ? &(r)->dev : (d)->dev, "%s: %.13s: %#llx @ %#llx " fmt, \
 		(r) ? dev_name((d)->dev) : "", res ? res->name : "null", \
@@ -155,6 +161,7 @@ struct nd_region {
 	u64 ndr_start;
 	int id, num_lanes, ro, numa_node;
 	void *provider_data;
+	struct kernfs_node *bb_state;
 	struct badblocks bb;
 	struct nd_interleave_set *nd_set;
 	struct nd_percpu_lane __percpu *lane;
@@ -188,6 +195,9 @@ struct nd_btt {
 	u64 size;
 	u8 *uuid;
 	int id;
+	int initial_offset;
+	u16 version_major;
+	u16 version_minor;
 };
 
 enum nd_pfn_mode {
@@ -229,6 +239,7 @@ ssize_t nd_sector_size_store(struct device *dev, const char *buf,
 		unsigned long *current_lbasize, const unsigned long *supported);
 int __init nvdimm_init(void);
 int __init nd_region_init(void);
+int __init nd_label_init(void);
 void nvdimm_exit(void);
 void nd_region_exit(void);
 struct nvdimm;
@@ -330,7 +341,8 @@ static inline struct device *nd_dax_create(struct nd_region *nd_region)
 struct nd_region *to_nd_region(struct device *dev);
 int nd_region_to_nstype(struct nd_region *nd_region);
 int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
-u64 nd_region_interleave_set_cookie(struct nd_region *nd_region);
+u64 nd_region_interleave_set_cookie(struct nd_region *nd_region,
+		struct nd_namespace_index *nsindex);
 u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region);
 void nvdimm_bus_lock(struct device *dev);
 void nvdimm_bus_unlock(struct device *dev);
@@ -349,6 +361,7 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
 int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt);
 const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 		char *name);
+unsigned int pmem_sector_size(struct nd_namespace_common *ndns);
 void nvdimm_badblocks_populate(struct nd_region *nd_region,
 		struct badblocks *bb, const struct resource *res);
 #if IS_ENABLED(CONFIG_ND_CLAIM)

+ 10 - 2
drivers/nvdimm/pfn_devs.c

@@ -331,7 +331,7 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
 	struct nd_pfn *nd_pfn;
 	struct device *dev;
 
-	if (!is_nd_pmem(&nd_region->dev))
+	if (!is_memory(&nd_region->dev))
 		return NULL;
 
 	nd_pfn = nd_pfn_alloc(nd_region);
@@ -354,7 +354,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 	if (!pfn_sb || !ndns)
 		return -ENODEV;
 
-	if (!is_nd_pmem(nd_pfn->dev.parent))
+	if (!is_memory(nd_pfn->dev.parent))
 		return -ENODEV;
 
 	if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0))
@@ -471,6 +471,14 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
 	if (ndns->force_raw)
 		return -ENODEV;
 
+	switch (ndns->claim_class) {
+	case NVDIMM_CCLASS_NONE:
+	case NVDIMM_CCLASS_PFN:
+		break;
+	default:
+		return -ENODEV;
+	}
+
 	nvdimm_bus_lock(&ndns->dev);
 	nd_pfn = nd_pfn_alloc(nd_region);
 	pfn_dev = nd_pfn_devinit(nd_pfn, ndns);

+ 57 - 6
drivers/nvdimm/pmem.c

@@ -28,7 +28,7 @@
 #include <linux/blk-mq.h>
 #include <linux/pfn_t.h>
 #include <linux/slab.h>
-#include <linux/pmem.h>
+#include <linux/uio.h>
 #include <linux/dax.h>
 #include <linux/nd.h>
 #include "pmem.h"
@@ -68,9 +68,11 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
 				(unsigned long long) sector, cleared,
 				cleared > 1 ? "s" : "");
 		badblocks_clear(&pmem->bb, sector, cleared);
+		if (pmem->bb_state)
+			sysfs_notify_dirent(pmem->bb_state);
 	}
 
-	invalidate_pmem(pmem->virt_addr + offset, len);
+	arch_invalidate_pmem(pmem->virt_addr + offset, len);
 
 	return rc;
 }
@@ -80,7 +82,7 @@ static void write_pmem(void *pmem_addr, struct page *page,
 {
 	void *mem = kmap_atomic(page);
 
-	memcpy_to_pmem(pmem_addr, mem + off, len);
+	memcpy_flushcache(pmem_addr, mem + off, len);
 	kunmap_atomic(mem);
 }
 
@@ -235,8 +237,27 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
 	return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
 }
 
+static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	return copy_from_iter_flushcache(addr, bytes, i);
+}
+
+static void pmem_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t size)
+{
+	arch_wb_cache_pmem(addr, size);
+}
+
 static const struct dax_operations pmem_dax_ops = {
 	.direct_access = pmem_dax_direct_access,
+	.copy_from_iter = pmem_copy_from_iter,
+	.flush = pmem_dax_flush,
+};
+
+static const struct attribute_group *pmem_attribute_groups[] = {
+	&dax_attribute_group,
+	NULL,
 };
 
 static void pmem_release_queue(void *q)
@@ -265,14 +286,15 @@ static int pmem_attach_disk(struct device *dev,
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	struct nd_region *nd_region = to_nd_region(dev->parent);
 	struct vmem_altmap __altmap, *altmap = NULL;
+	int nid = dev_to_node(dev), fua, wbc;
 	struct resource *res = &nsio->res;
 	struct nd_pfn *nd_pfn = NULL;
 	struct dax_device *dax_dev;
-	int nid = dev_to_node(dev);
 	struct nd_pfn_sb *pfn_sb;
 	struct pmem_device *pmem;
 	struct resource pfn_res;
 	struct request_queue *q;
+	struct device *gendev;
 	struct gendisk *disk;
 	void *addr;
 
@@ -294,8 +316,12 @@ static int pmem_attach_disk(struct device *dev,
 	dev_set_drvdata(dev, pmem);
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
-	if (nvdimm_has_flush(nd_region) < 0)
+	fua = nvdimm_has_flush(nd_region);
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) {
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
+		fua = 0;
+	}
+	wbc = nvdimm_has_cache(nd_region);
 
 	if (!devm_request_mem_region(dev, res->start, resource_size(res),
 				dev_name(&ndns->dev))) {
@@ -339,9 +365,10 @@ static int pmem_attach_disk(struct device *dev,
 		return PTR_ERR(addr);
 	pmem->virt_addr = addr;
 
-	blk_queue_write_cache(q, true, true);
+	blk_queue_write_cache(q, wbc, fua);
 	blk_queue_make_request(q, pmem_make_request);
 	blk_queue_physical_block_size(q, PAGE_SIZE);
+	blk_queue_logical_block_size(q, pmem_sector_size(ndns));
 	blk_queue_max_hw_sectors(q, UINT_MAX);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
 	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
@@ -368,14 +395,23 @@ static int pmem_attach_disk(struct device *dev,
 		put_disk(disk);
 		return -ENOMEM;
 	}
+	dax_write_cache(dax_dev, wbc);
 	pmem->dax_dev = dax_dev;
 
+	gendev = disk_to_dev(disk);
+	gendev->groups = pmem_attribute_groups;
+
 	device_add_disk(dev, disk);
 	if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
 		return -ENOMEM;
 
 	revalidate_disk(disk);
 
+	pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd,
+					  "badblocks");
+	if (!pmem->bb_state)
+		dev_warn(dev, "'badblocks' notification disabled\n");
+
 	return 0;
 }
 
@@ -407,8 +443,18 @@ static int nd_pmem_probe(struct device *dev)
 
 static int nd_pmem_remove(struct device *dev)
 {
+	struct pmem_device *pmem = dev_get_drvdata(dev);
+
 	if (is_nd_btt(dev))
 		nvdimm_namespace_detach_btt(to_nd_btt(dev));
+	else {
+		/*
+		 * Note, this assumes device_lock() context to not race
+		 * nd_pmem_notify()
+		 */
+		sysfs_put(pmem->bb_state);
+		pmem->bb_state = NULL;
+	}
 	nvdimm_flush(to_nd_region(dev->parent));
 
 	return 0;
@@ -427,6 +473,7 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 	struct nd_namespace_io *nsio;
 	struct resource res;
 	struct badblocks *bb;
+	struct kernfs_node *bb_state;
 
 	if (event != NVDIMM_REVALIDATE_POISON)
 		return;
@@ -438,11 +485,13 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 		nd_region = to_nd_region(ndns->dev.parent);
 		nsio = to_nd_namespace_io(&ndns->dev);
 		bb = &nsio->bb;
+		bb_state = NULL;
 	} else {
 		struct pmem_device *pmem = dev_get_drvdata(dev);
 
 		nd_region = to_region(pmem);
 		bb = &pmem->bb;
+		bb_state = pmem->bb_state;
 
 		if (is_nd_pfn(dev)) {
 			struct nd_pfn *nd_pfn = to_nd_pfn(dev);
@@ -462,6 +511,8 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 	res.start = nsio->res.start + offset;
 	res.end = nsio->res.end - end_trunc;
 	nvdimm_badblocks_populate(nd_region, bb, &res);
+	if (bb_state)
+		sysfs_notify_dirent(bb_state);
 }
 
 MODULE_ALIAS("pmem");

+ 15 - 0
drivers/nvdimm/pmem.h

@@ -5,6 +5,20 @@
 #include <linux/pfn_t.h>
 #include <linux/fs.h>
 
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
+void arch_wb_cache_pmem(void *addr, size_t size);
+void arch_invalidate_pmem(void *addr, size_t size);
+#else
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
+static inline void arch_wb_cache_pmem(void *addr, size_t size)
+{
+}
+static inline void arch_invalidate_pmem(void *addr, size_t size)
+{
+}
+#endif
+
 /* this definition is in its own header for tools/testing/nvdimm to consume */
 struct pmem_device {
 	/* One contiguous memory region per device */
@@ -17,6 +31,7 @@ struct pmem_device {
 	size_t			size;
 	/* trim size when namespace capacity has been section aligned */
 	u32			pfn_pad;
+	struct kernfs_node	*bb_state;
 	struct badblocks	bb;
 	struct dax_device	*dax_dev;
 	struct gendisk		*disk;

+ 15 - 2
drivers/nvdimm/region.c

@@ -58,10 +58,14 @@ static int nd_region_probe(struct device *dev)
 
 		if (devm_init_badblocks(dev, &nd_region->bb))
 			return -ENODEV;
+		nd_region->bb_state = sysfs_get_dirent(nd_region->dev.kobj.sd,
+						       "badblocks");
+		if (!nd_region->bb_state)
+			dev_warn(&nd_region->dev,
+					"'badblocks' notification disabled\n");
 		ndr_res.start = nd_region->ndr_start;
 		ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1;
-		nvdimm_badblocks_populate(nd_region,
-				&nd_region->bb, &ndr_res);
+		nvdimm_badblocks_populate(nd_region, &nd_region->bb, &ndr_res);
 	}
 
 	nd_region->btt_seed = nd_btt_create(nd_region);
@@ -105,6 +109,13 @@ static int nd_region_remove(struct device *dev)
 	dev_set_drvdata(dev, NULL);
 	nvdimm_bus_unlock(dev);
 
+	/*
+	 * Note, this assumes device_lock() context to not race
+	 * nd_region_notify()
+	 */
+	sysfs_put(nd_region->bb_state);
+	nd_region->bb_state = NULL;
+
 	return 0;
 }
 
@@ -126,6 +137,8 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event)
 				nd_region->ndr_size - 1;
 			nvdimm_badblocks_populate(nd_region,
 					&nd_region->bb, &res);
+			if (nd_region->bb_state)
+				sysfs_notify_dirent(nd_region->bb_state);
 		}
 	}
 	device_for_each_child(dev, &event, child_notify);
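
Note: with bb_state held and sysfs_notify_dirent() fired on updates, the region 'badblocks' file (and the pmem one above) becomes poll(2) capable. A minimal userspace consumer sketch; the device path is an example, and sysfs requires the usual read-then-poll-for-POLLPRI sequence:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	/* example path; substitute the region or pmem device of interest */
	int fd = open("/sys/bus/nd/devices/region0/badblocks", O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };

	if (fd < 0)
		return 1;
	read(fd, buf, sizeof(buf));	/* consume current state to arm */
	while (poll(&pfd, 1, -1) > 0) {
		ssize_t n;

		lseek(fd, 0, SEEK_SET);
		n = read(fd, buf, sizeof(buf));
		if (n > 0)
			fwrite(buf, 1, n, stdout);
	}
	return 0;
}
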

+ 64 - 24
drivers/nvdimm/region_devs.c

@@ -15,7 +15,6 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/hash.h>
-#include <linux/pmem.h>
 #include <linux/sort.h>
 #include <linux/io.h>
 #include <linux/nd.h>
@@ -169,6 +168,11 @@ bool is_nd_blk(struct device *dev)
 	return dev ? dev->type == &nd_blk_device_type : false;
 }
 
+bool is_nd_volatile(struct device *dev)
+{
+	return dev ? dev->type == &nd_volatile_device_type : false;
+}
+
 struct nd_region *to_nd_region(struct device *dev)
 {
 	struct nd_region *nd_region = container_of(dev, struct nd_region, dev);
@@ -215,7 +219,7 @@ EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
  */
 int nd_region_to_nstype(struct nd_region *nd_region)
 {
-	if (is_nd_pmem(&nd_region->dev)) {
+	if (is_memory(&nd_region->dev)) {
 		u16 i, alias;
 
 		for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) {
@@ -243,7 +247,7 @@ static ssize_t size_show(struct device *dev,
 	struct nd_region *nd_region = to_nd_region(dev);
 	unsigned long long size = 0;
 
-	if (is_nd_pmem(dev)) {
+	if (is_memory(dev)) {
 		size = nd_region->ndr_size;
 	} else if (nd_region->ndr_mappings == 1) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[0];
@@ -307,13 +311,41 @@ static ssize_t set_cookie_show(struct device *dev,
 {
 	struct nd_region *nd_region = to_nd_region(dev);
 	struct nd_interleave_set *nd_set = nd_region->nd_set;
+	ssize_t rc = 0;
 
-	if (is_nd_pmem(dev) && nd_set)
+	if (is_memory(dev) && nd_set)
 		/* pass, should be precluded by region_visible */;
 	else
 		return -ENXIO;
 
-	return sprintf(buf, "%#llx\n", nd_set->cookie);
+	/*
+	 * The cookie to show depends on which specification of the
+	 * labels we are using. If there are not labels then default to
+	 * the v1.1 namespace label cookie definition. To read all this
+	 * data we need to wait for probing to settle.
+	 */
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
+	wait_nvdimm_bus_probe_idle(dev);
+	if (nd_region->ndr_mappings) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+
+		if (ndd) {
+			struct nd_namespace_index *nsindex;
+
+			nsindex = to_namespace_index(ndd, ndd->ns_current);
+			rc = sprintf(buf, "%#llx\n",
+					nd_region_interleave_set_cookie(nd_region,
+						nsindex));
+		}
+	}
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
+
+	if (rc)
+		return rc;
+	return sprintf(buf, "%#llx\n", nd_set->cookie1);
 }
 static DEVICE_ATTR_RO(set_cookie);
 
@@ -335,7 +367,7 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
 		if (!ndd)
 			return 0;
 
-		if (is_nd_pmem(&nd_region->dev)) {
+		if (is_memory(&nd_region->dev)) {
 			available += nd_pmem_available_dpa(nd_region,
 					nd_mapping, &overlap);
 			if (overlap > blk_max_overlap) {
@@ -521,10 +553,10 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	struct nd_interleave_set *nd_set = nd_region->nd_set;
 	int type = nd_region_to_nstype(nd_region);
 
-	if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr)
+	if (!is_memory(dev) && a == &dev_attr_pfn_seed.attr)
 		return 0;
 
-	if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr)
+	if (!is_memory(dev) && a == &dev_attr_dax_seed.attr)
 		return 0;
 
 	if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
@@ -552,7 +584,7 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 				|| type == ND_DEVICE_NAMESPACE_BLK)
 			&& a == &dev_attr_available_size.attr)
 		return a->mode;
-	else if (is_nd_pmem(dev) && nd_set)
+	else if (is_memory(dev) && nd_set)
 		return a->mode;
 
 	return 0;
@@ -564,13 +596,18 @@ struct attribute_group nd_region_attribute_group = {
 };
 EXPORT_SYMBOL_GPL(nd_region_attribute_group);
 
-u64 nd_region_interleave_set_cookie(struct nd_region *nd_region)
+u64 nd_region_interleave_set_cookie(struct nd_region *nd_region,
+		struct nd_namespace_index *nsindex)
 {
 	struct nd_interleave_set *nd_set = nd_region->nd_set;
 
-	if (nd_set)
-		return nd_set->cookie;
-	return 0;
+	if (!nd_set)
+		return 0;
+
+	if (nsindex && __le16_to_cpu(nsindex->major) == 1
+			&& __le16_to_cpu(nsindex->minor) == 1)
+		return nd_set->cookie1;
+	return nd_set->cookie2;
 }
 
 u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region)
@@ -604,7 +641,7 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 {
 	struct nd_region *nd_region;
 
-	if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) {
+	if (!probe && is_nd_region(dev)) {
 		int i;
 
 		nd_region = to_nd_region(dev);
@@ -622,12 +659,8 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 			if (ndd)
 				atomic_dec(&nvdimm->busy);
 		}
-
-		if (is_nd_pmem(dev))
-			return;
 	}
-	if (dev->parent && (is_nd_blk(dev->parent) || is_nd_pmem(dev->parent))
-			&& probe) {
+	if (dev->parent && is_nd_region(dev->parent) && probe) {
 		nd_region = to_nd_region(dev->parent);
 		nvdimm_bus_lock(dev);
 		if (nd_region->ns_seed == dev)
@@ -800,7 +833,7 @@ int nd_blk_region_init(struct nd_region *nd_region)
 		return 0;
 
 	if (nd_region->ndr_mappings < 1) {
-		dev_err(dev, "invalid BLK region\n");
+		dev_dbg(dev, "invalid BLK region\n");
 		return -ENXIO;
 	}
 
@@ -1015,8 +1048,8 @@ void nvdimm_flush(struct nd_region *nd_region)
 	 * The first wmb() is needed to 'sfence' all previous writes
 	 * such that they are architecturally visible for the platform
 	 * buffer flush.  Note that we've already arranged for pmem
-	 * writes to avoid the cache via arch_memcpy_to_pmem().  The
-	 * final wmb() ensures ordering for the NVDIMM flush write.
+	 * writes to avoid the cache via memcpy_flushcache().  The final
+	 * wmb() ensures ordering for the NVDIMM flush write.
 	 */
 	wmb();
 	for (i = 0; i < nd_region->ndr_mappings; i++)
@@ -1038,8 +1071,9 @@ int nvdimm_has_flush(struct nd_region *nd_region)
 {
 	int i;
 
-	/* no nvdimm == flushing capability unknown */
-	if (nd_region->ndr_mappings == 0)
+	/* no nvdimm or pmem api == flushing capability unknown */
+	if (nd_region->ndr_mappings == 0
+			|| !IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API))
 		return -ENXIO;
 
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
@@ -1059,6 +1093,12 @@ int nvdimm_has_flush(struct nd_region *nd_region)
 }
 EXPORT_SYMBOL_GPL(nvdimm_has_flush);
 
+int nvdimm_has_cache(struct nd_region *nd_region)
+{
+	return is_nd_pmem(&nd_region->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_has_cache);
+
 void __exit nd_region_devs_exit(void)
 {
 	ida_destroy(&region_ida);
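
Note: taken together with the reworded nvdimm_flush() comment above, the persistent-write recipe after this series is a cache-bypassing (or cache-writing-back) copy followed by nvdimm_flush() to order and drain platform write buffers. A kernel-context sketch, assuming dst already maps pmem belonging to nd_region:

#include <linux/string.h>
#include <linux/libnvdimm.h>

/* sketch only: dst must map pmem owned by nd_region */
static void pmem_write_persistent(struct nd_region *nd_region,
		void *dst, const void *src, size_t n)
{
	memcpy_flushcache(dst, src, n);	/* movnt, or copy + clwb */
	nvdimm_flush(nd_region);	/* wmb() + flush-hint writes */
}
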

+ 8 - 0
drivers/s390/block/dcssblk.c

@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
 #include <linux/pfn_t.h>
+#include <linux/uio.h>
 #include <linux/dax.h>
 #include <asm/extmem.h>
 #include <asm/io.h>
@@ -43,8 +44,15 @@ static const struct block_device_operations dcssblk_devops = {
 	.release 	= dcssblk_release,
 };
 
+static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev,
+		pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
+{
+	return copy_from_iter(addr, bytes, i);
+}
+
 static const struct dax_operations dcssblk_dax_ops = {
 	.direct_access = dcssblk_dax_direct_access,
+	.copy_from_iter = dcssblk_dax_copy_from_iter,
 };
 
 struct dcssblk_dev_info {

+ 5 - 4
fs/dax.c

@@ -25,7 +25,6 @@
 #include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/pagevec.h>
-#include <linux/pmem.h>
 #include <linux/sched.h>
 #include <linux/sched/signal.h>
 #include <linux/uio.h>
@@ -784,7 +783,7 @@ static int dax_writeback_one(struct block_device *bdev,
 	}
 
 	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
-	wb_cache_pmem(kaddr, size);
+	dax_flush(dax_dev, pgoff, kaddr, size);
 	/*
 	 * After we have flushed the cache, we can clear the dirty tag. There
 	 * cannot be new dirty data in the pfn after the flush has completed as
@@ -976,7 +975,8 @@ int __dax_zero_page_range(struct block_device *bdev,
 			dax_read_unlock(id);
 			return rc;
 		}
-		clear_pmem(kaddr + offset, size);
+		memset(kaddr + offset, 0, size);
+		dax_flush(dax_dev, pgoff, kaddr + offset, size);
 		dax_read_unlock(id);
 	}
 	return 0;
@@ -1055,7 +1055,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 			map_len = end - pos;
 
 		if (iov_iter_rw(iter) == WRITE)
-			map_len = copy_from_iter_pmem(kaddr, map_len, iter);
+			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+					map_len, iter);
 		else
 			map_len = copy_to_iter(kaddr, map_len, iter);
 		if (map_len <= 0) {

+ 12 - 0
include/linux/dax.h

@@ -16,8 +16,15 @@ struct dax_operations {
 	 */
 	long (*direct_access)(struct dax_device *, pgoff_t, long,
 			void **, pfn_t *);
+	/* copy_from_iter: required operation for fs-dax direct-i/o */
+	size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
+			struct iov_iter *);
+	/* flush: optional driver-specific cache management after writes */
+	void (*flush)(struct dax_device *, pgoff_t, void *, size_t);
 };
 
+extern struct attribute_group dax_attribute_group;
+
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *dax_get_by_host(const char *host);
 void put_dax(struct dax_device *dax_dev);
@@ -75,6 +82,11 @@ void kill_dax(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 		void **kaddr, pfn_t *pfn);
+size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+		size_t bytes, struct iov_iter *i);
+void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+		size_t size);
+void dax_write_cache(struct dax_device *dax_dev, bool wc);
 
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
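
Note: the two new dax_operations get thin dispatch wrappers, dax_copy_from_iter() and dax_flush(), which fs/dax.c calls in the dax_iomap_actor() and dax_writeback_one() hunks above. The copy wrapper amounts to roughly the following sketch (the actual implementation lives in drivers/dax/super.c, where struct dax_device is private; dax_alive() guards against a torn-down device):

/* approximate shape of the generic wrapper, not the verbatim source */
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i)
{
	if (!dax_alive(dax_dev))
		return 0;

	return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
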

+ 6 - 0
include/linux/device-mapper.h

@@ -132,6 +132,10 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
  */
 typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn);
+typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i);
+typedef void (*dm_dax_flush_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr,
+		size_t size);
 #define PAGE_SECTORS (PAGE_SIZE / 512)
 
 void dm_error(const char *message);
@@ -181,6 +185,8 @@ struct target_type {
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
 	dm_dax_direct_access_fn direct_access;
+	dm_dax_copy_from_iter_fn dax_copy_from_iter;
+	dm_dax_flush_fn dax_flush;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
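
Note: a dax-capable device-mapper target implements these hooks by translating the target-relative pgoff to the backing device and forwarding to the generic wrappers. A hedged sketch of the shape such a method takes; struct example_ctx and remap_pgoff() are illustrative stand-ins, not dm API:

#include <linux/device-mapper.h>
#include <linux/dax.h>

/* illustrative context; a real target keeps its own mapping state */
struct example_ctx {
	struct dax_device *dax_dev;
	sector_t start;		/* target's offset on the backing device */
};

static pgoff_t remap_pgoff(struct example_ctx *ec, pgoff_t pgoff)
{
	/* hypothetical helper: shift by the start offset, in pages */
	return pgoff + (ec->start >> (PAGE_SHIFT - 9));
}

static size_t example_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i)
{
	struct example_ctx *ec = ti->private;

	return dax_copy_from_iter(ec->dax_dev, remap_pgoff(ec, pgoff),
			addr, bytes, i);
}
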

+ 10 - 1
include/linux/libnvdimm.h

@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/sizes.h>
 #include <linux/types.h>
+#include <linux/uuid.h>
 
 enum {
 	/* when a dimm supports both PMEM and BLK access a label is required */
@@ -54,6 +55,7 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
 
 struct nvdimm_bus_descriptor {
 	const struct attribute_group **attr_groups;
+	unsigned long bus_dsm_mask;
 	unsigned long cmd_mask;
 	struct module *module;
 	char *provider_name;
@@ -71,9 +73,14 @@ struct nd_cmd_desc {
 };
 
 struct nd_interleave_set {
-	u64 cookie;
+	/* v1.1 definition of the interleave-set-cookie algorithm */
+	u64 cookie1;
+	/* v1.2 definition of the interleave-set-cookie algorithm */
+	u64 cookie2;
 	/* compatibility with initial buggy Linux implementation */
 	u64 altcookie;
+
+	guid_t type_guid;
 };
 
 struct nd_mapping_desc {
@@ -159,9 +166,11 @@ void *nd_region_provider_data(struct nd_region *nd_region);
 void *nd_blk_region_provider_data(struct nd_blk_region *ndbr);
 void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data);
 struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr);
+unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr);
 unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
 void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
 void nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
+int nvdimm_has_cache(struct nd_region *nd_region);
 #endif /* __LIBNVDIMM_H__ */

+ 13 - 0
include/linux/nd.h

@@ -21,6 +21,15 @@ enum nvdimm_event {
 	NVDIMM_REVALIDATE_POISON,
 };
 
+enum nvdimm_claim_class {
+	NVDIMM_CCLASS_NONE,
+	NVDIMM_CCLASS_BTT,
+	NVDIMM_CCLASS_BTT2,
+	NVDIMM_CCLASS_PFN,
+	NVDIMM_CCLASS_DAX,
+	NVDIMM_CCLASS_UNKNOWN,
+};
+
 struct nd_device_driver {
 	struct device_driver drv;
 	unsigned long type;
@@ -41,12 +50,14 @@ static inline struct nd_device_driver *to_nd_device_driver(
  * @force_raw: ignore other personalities for the namespace (e.g. btt)
  * @dev: device model node
 * @claim: when set, another personality has taken ownership of the namespace
+ * @claim_class: restrict claim type to a given class
  * @rw_bytes: access the raw namespace capacity with byte-aligned transfers
  */
 struct nd_namespace_common {
 	int force_raw;
 	struct device dev;
 	struct device *claim;
+	enum nvdimm_claim_class claim_class;
 	int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset,
 			void *buf, size_t size, int rw, unsigned long flags);
 };
@@ -75,12 +86,14 @@ struct nd_namespace_io {
 /**
  * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory
  * @nsio: device and system physical address range to drive
+ * @lbasize: logical sector size for the namespace in block-device-mode
  * @alt_name: namespace name supplied in the dimm label
  * @uuid: namespace name supplied in the dimm label
  * @id: ida allocated id
  */
 struct nd_namespace_pmem {
 	struct nd_namespace_io nsio;
+	unsigned long lbasize;
 	char *alt_name;
 	u8 *uuid;
 	int id;

+ 0 - 142
include/linux/pmem.h

@@ -1,142 +0,0 @@
-/*
- * Copyright(c) 2015 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-#ifndef __PMEM_H__
-#define __PMEM_H__
-
-#include <linux/io.h>
-#include <linux/uio.h>
-
-#ifdef CONFIG_ARCH_HAS_PMEM_API
-#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
-#include <asm/pmem.h>
-#else
-#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
-/*
- * These are simply here to enable compilation, all call sites gate
- * calling these symbols with arch_has_pmem_api() and redirect to the
- * implementation in asm/pmem.h.
- */
-static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
-{
-	BUG();
-}
-
-static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
-		struct iov_iter *i)
-{
-	BUG();
-	return 0;
-}
-
-static inline void arch_clear_pmem(void *addr, size_t size)
-{
-	BUG();
-}
-
-static inline void arch_wb_cache_pmem(void *addr, size_t size)
-{
-	BUG();
-}
-
-static inline void arch_invalidate_pmem(void *addr, size_t size)
-{
-	BUG();
-}
-#endif
-
-static inline bool arch_has_pmem_api(void)
-{
-	return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API);
-}
-
-/**
- * memcpy_to_pmem - copy data to persistent memory
- * @dst: destination buffer for the copy
- * @src: source buffer for the copy
- * @n: length of the copy in bytes
- *
- * Perform a memory copy that results in the destination of the copy
- * being effectively evicted from, or never written to, the processor
- * cache hierarchy after the copy completes.  After memcpy_to_pmem()
- * data may still reside in cpu or platform buffers, so this operation
- * must be followed by a blkdev_issue_flush() on the pmem block device.
- */
-static inline void memcpy_to_pmem(void *dst, const void *src, size_t n)
-{
-	if (arch_has_pmem_api())
-		arch_memcpy_to_pmem(dst, src, n);
-	else
-		memcpy(dst, src, n);
-}
-
-/**
- * copy_from_iter_pmem - copy data from an iterator to PMEM
- * @addr:	PMEM destination address
- * @bytes:	number of bytes to copy
- * @i:		iterator with source data
- *
- * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
- * See blkdev_issue_flush() note for memcpy_to_pmem().
- */
-static inline size_t copy_from_iter_pmem(void *addr, size_t bytes,
-		struct iov_iter *i)
-{
-	if (arch_has_pmem_api())
-		return arch_copy_from_iter_pmem(addr, bytes, i);
-	return copy_from_iter_nocache(addr, bytes, i);
-}
-
-/**
- * clear_pmem - zero a PMEM memory range
- * @addr:	virtual start address
- * @size:	number of bytes to zero
- *
- * Write zeros into the memory range starting at 'addr' for 'size' bytes.
- * See blkdev_issue_flush() note for memcpy_to_pmem().
- */
-static inline void clear_pmem(void *addr, size_t size)
-{
-	if (arch_has_pmem_api())
-		arch_clear_pmem(addr, size);
-	else
-		memset(addr, 0, size);
-}
-
-/**
- * invalidate_pmem - flush a pmem range from the cache hierarchy
- * @addr:	virtual start address
- * @size:	bytes to invalidate (internally aligned to cache line size)
- *
- * For platforms that support clearing poison this flushes any poisoned
- * ranges out of the cache
- */
-static inline void invalidate_pmem(void *addr, size_t size)
-{
-	if (arch_has_pmem_api())
-		arch_invalidate_pmem(addr, size);
-}
-
-/**
- * wb_cache_pmem - write back processor cache for PMEM memory range
- * @addr:	virtual start address
- * @size:	number of bytes to write back
- *
- * Write back the processor cache range starting at 'addr' for 'size' bytes.
- * See blkdev_issue_flush() note for memcpy_to_pmem().
- */
-static inline void wb_cache_pmem(void *addr, size_t size)
-{
-	if (arch_has_pmem_api())
-		arch_wb_cache_pmem(addr, size);
-}
-#endif /* __PMEM_H__ */

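With consumers calling the _flushcache() helpers directly, these wrappers have no users left. As a reading aid, here is a rough mapping from the deleted helpers to their replacements elsewhere in this series; the right-hand column is an assumption drawn from the surrounding hunks, not a quoted table:

/*
 * Deleted helper                    Replacement (assumed)
 * memcpy_to_pmem(dst, src, n)    -> memcpy_flushcache(dst, src, n)
 * copy_from_iter_pmem(a, n, i)   -> copy_from_iter_flushcache(a, n, i)
 * wb_cache_pmem(addr, n)         -> arch_wb_cache_pmem(addr, n)
 * invalidate_pmem(addr, n)       -> arch_invalidate_pmem(addr, n)
 * clear_pmem(addr, n)            -> memset(addr, 0, n) plus a dax flush
 */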
+ 6 - 0
include/linux/string.h

@@ -122,6 +122,12 @@ static inline __must_check int memcpy_mcsafe(void *dst, const void *src,
 	return 0;
 }
 #endif
+#ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE
+static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+	memcpy(dst, src, cnt);
+}
+#endif
 void *memchr_inv(const void *s, int c, size_t n);
 char *strreplace(char *s, char old, char new);
 

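Because the fallback above is a plain memcpy(), durability is only guaranteed on architectures that define __HAVE_ARCH_MEMCPY_FLUSHCACHE. A minimal usage sketch under that caveat; persist_record() and its arguments are invented, while nvdimm_flush() is the real region-level flush declared earlier in this diff:

#include <linux/string.h>
#include <linux/libnvdimm.h>

/* Hypothetical caller: copy a record into a pmem mapping, then flush
 * platform write-pending queues so the write is durable */
static void persist_record(struct nd_region *nd_region, void *pmem_dst,
			   const void *rec, size_t len)
{
	/* bypasses the cache, or writes back the lines it dirtied */
	memcpy_flushcache(pmem_dst, rec, len);
	nvdimm_flush(nd_region);
}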
+ 15 - 0
include/linux/uio.h

@@ -95,6 +95,21 @@ size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
 size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
 bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
 size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+/*
+ * Note: users like pmem that depend on the stricter semantics of
+ * copy_from_iter_flushcache() relative to copy_from_iter_nocache() must
+ * check for IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming
+ * that the destination is flushed from the cache on return.
+ */
+size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
+#else
+static inline size_t copy_from_iter_flushcache(void *addr, size_t bytes,
+				       struct iov_iter *i)
+{
+	return copy_from_iter_nocache(addr, bytes, i);
+}
+#endif
 bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i);
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);

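A sketch of the calling convention the note above prescribes; pmem_write() is hypothetical, and only copy_from_iter_flushcache() and the IS_ENABLED() gate come from the interface being added:

#include <linux/kernel.h>
#include <linux/uio.h>

/* Hypothetical caller honoring the IS_ENABLED() contract above */
static size_t pmem_write(void *pmem_addr, size_t bytes, struct iov_iter *i)
{
	size_t copied = copy_from_iter_flushcache(pmem_addr, bytes, i);

	/*
	 * Without ARCH_HAS_UACCESS_FLUSHCACHE this silently fell back
	 * to copy_from_iter_nocache(), so the destination may still be
	 * dirty in the cpu cache and a writeback is still owed before
	 * the data can be considered persistent.
	 */
	if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE))
		pr_debug("flushcache unavailable, writeback still owed\n");

	return copied;
}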
+ 41 - 1
include/uapi/linux/ndctl.h

@@ -105,7 +105,8 @@ struct nd_cmd_ars_cap {
 	__u32 status;
 	__u32 max_ars_out;
 	__u32 clear_err_unit;
-	__u32 reserved;
+	__u16 flags;
+	__u16 reserved;
 } __packed;
 
 struct nd_cmd_ars_start {
@@ -144,6 +145,43 @@ struct nd_cmd_clear_error {
 	__u64 cleared;
 } __packed;
 
+struct nd_cmd_trans_spa {
+	__u64 spa;
+	__u32 status;
+	__u8  flags;
+	__u8  _reserved[3];
+	__u64 trans_length;
+	__u32 num_nvdimms;
+	struct nd_nvdimm_device {
+		__u32 nfit_device_handle;
+		__u32 _reserved;
+		__u64 dpa;
+	} __packed devices[0];
+
+} __packed;
+
+struct nd_cmd_ars_err_inj {
+	__u64 err_inj_spa_range_base;
+	__u64 err_inj_spa_range_length;
+	__u8  err_inj_options;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_clr {
+	__u64 err_inj_clr_spa_range_base;
+	__u64 err_inj_clr_spa_range_length;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_stat {
+	__u32 status;
+	__u32 inj_err_rec_count;
+	struct nd_error_stat_query_record {
+		__u64 err_inj_stat_spa_range_base;
+		__u64 err_inj_stat_spa_range_length;
+	} __packed record[0];
+} __packed;
+
 enum {
 	ND_CMD_IMPLEMENTED = 0,
 
@@ -169,6 +207,7 @@ enum {
 enum {
 	ND_ARS_VOLATILE = 1,
 	ND_ARS_PERSISTENT = 2,
+	ND_ARS_RETURN_PREV_DATA = 1 << 1,
 	ND_CONFIG_LOCKED = 1,
 };
 
@@ -179,6 +218,7 @@ static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
 		[ND_CMD_ARS_START] = "ars_start",
 		[ND_CMD_ARS_STATUS] = "ars_status",
 		[ND_CMD_CLEAR_ERROR] = "clear_error",
+		[ND_CMD_CALL] = "cmd_call",
 	};
 
 	if (cmd < ARRAY_SIZE(names) && names[cmd])

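The 'cmd_call' entry names the ND_CMD_CALL passthrough that the bus-level DSM commits in this pull open up. A userspace sketch of issuing a raw function through it; the device path, family, function number, and payload size are assumptions, and a real caller would size and parse the payload per the target DSM:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/ndctl.h>

int main(void)
{
	size_t size_out = 128;			/* assumed output size */
	int fd = open("/dev/ndctl0", O_RDWR);	/* assumed bus device */
	struct nd_cmd_pkg *pkg;

	if (fd < 0)
		return 1;

	pkg = calloc(1, sizeof(*pkg) + size_out);
	if (!pkg)
		return 1;
	pkg->nd_family = NVDIMM_FAMILY_INTEL;	/* assumed DSM family */
	pkg->nd_command = 1;			/* hypothetical function */
	pkg->nd_size_out = size_out;

	if (ioctl(fd, ND_IOCTL_CALL, pkg) < 0)
		perror("ND_IOCTL_CALL");
	else
		printf("firmware returned %u bytes\n", pkg->nd_fw_size);

	free(pkg);
	close(fd);
	return 0;
}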
+ 3 - 0
lib/Kconfig

@@ -556,6 +556,9 @@ config ARCH_HAS_SG_CHAIN
 config ARCH_HAS_PMEM_API
 	bool
 
+config ARCH_HAS_UACCESS_FLUSHCACHE
+	bool
+
 config ARCH_HAS_MMIO_FLUSH
 	bool
 

+ 22 - 0
lib/iov_iter.c

@@ -615,6 +615,28 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 }
 EXPORT_SYMBOL(copy_from_iter_nocache);
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
+{
+	char *to = addr;
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return 0;
+	}
+	iterate_and_advance(i, bytes, v,
+		__copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
+					 v.iov_base, v.iov_len),
+		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len),
+		memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
+			v.iov_len)
+	)
+
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(copy_from_iter_flushcache);
+#endif
+
 bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
 {
 	char *to = addr;

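The bvec arm above relies on memcpy_page_flushcache(); elsewhere in this series it is plausibly just a kmap wrapper over memcpy_flushcache(). A sketch of that shape, presented as an assumption rather than the quoted implementation:

#include <linux/highmem.h>
#include <linux/string.h>

/* Assumed shape: map the page, reuse the flushing memcpy, unmap */
static inline void memcpy_page_flushcache(char *to, struct page *page,
					  size_t offset, size_t len)
{
	char *from = kmap_atomic(page);

	memcpy_flushcache(to, from + offset, len);
	kunmap_atomic(from);
}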
+ 1 - 1
tools/testing/nvdimm/test/nfit.c

@@ -1943,7 +1943,7 @@ static __init int nfit_test_init(void)
 			nfit_test->setup = nfit_test0_setup;
 			break;
 		case 1:
-			nfit_test->num_pm = 1;
+			nfit_test->num_pm = 2;
 			nfit_test->dcr_idx = NUM_DCR;
 			nfit_test->num_dcr = 2;
 			nfit_test->alloc = nfit_test1_alloc;