pmem.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
  /*
   * Persistent Memory Driver
   *
   * Copyright (c) 2014-2015, Intel Corporation.
   * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
   * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms and conditions of the GNU General Public License,
   * version 2, as published by the Free Software Foundation.
   *
   * This program is distributed in the hope it will be useful, but WITHOUT
   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   * more details.
   */
  17. #include <asm/cacheflush.h>
  18. #include <linux/blkdev.h>
  19. #include <linux/hdreg.h>
  20. #include <linux/init.h>
  21. #include <linux/platform_device.h>
  22. #include <linux/module.h>
  23. #include <linux/moduleparam.h>
  24. #include <linux/badblocks.h>
  25. #include <linux/memremap.h>
  26. #include <linux/vmalloc.h>
  27. #include <linux/pfn_t.h>
  28. #include <linux/slab.h>
  29. #include <linux/pmem.h>
  30. #include <linux/nd.h>
  31. #include "pfn.h"
  32. #include "nd.h"
  33. struct pmem_device {
  34. struct request_queue *pmem_queue;
  35. struct gendisk *pmem_disk;
  36. struct nd_namespace_common *ndns;
  37. /* One contiguous memory region per device */
  38. phys_addr_t phys_addr;
  39. /* when non-zero this device is hosting a 'pfn' instance */
  40. phys_addr_t data_offset;
  41. u64 pfn_flags;
  42. void __pmem *virt_addr;
  43. size_t size;
  44. struct badblocks bb;
  45. };
  46. static int pmem_major;
  47. static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
  48. {
  49. if (bb->count) {
  50. sector_t first_bad;
  51. int num_bad;
  52. return !!badblocks_check(bb, sector, len / 512, &first_bad,
  53. &num_bad);
  54. }
  55. return false;
  56. }
  57. static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
  58. unsigned int len, unsigned int off, int rw,
  59. sector_t sector)
  60. {
  61. void *mem = kmap_atomic(page);
  62. phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
  63. void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
  64. if (rw == READ) {
  65. if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
  66. return -EIO;
  67. memcpy_from_pmem(mem + off, pmem_addr, len);
  68. flush_dcache_page(page);
  69. } else {
  70. flush_dcache_page(page);
  71. memcpy_to_pmem(pmem_addr, mem + off, len);
  72. }
  73. kunmap_atomic(mem);
  74. return 0;
  75. }
  76. static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
  77. {
  78. int rc = 0;
  79. bool do_acct;
  80. unsigned long start;
  81. struct bio_vec bvec;
  82. struct bvec_iter iter;
  83. struct block_device *bdev = bio->bi_bdev;
  84. struct pmem_device *pmem = bdev->bd_disk->private_data;
  85. do_acct = nd_iostat_start(bio, &start);
  86. bio_for_each_segment(bvec, bio, iter) {
  87. rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
  88. bvec.bv_offset, bio_data_dir(bio),
  89. iter.bi_sector);
  90. if (rc) {
  91. bio->bi_error = rc;
  92. break;
  93. }
  94. }
  95. if (do_acct)
  96. nd_iostat_end(bio, start);
  97. if (bio_data_dir(bio))
  98. wmb_pmem();
  99. bio_endio(bio);
  100. return BLK_QC_T_NONE;
  101. }
  102. static int pmem_rw_page(struct block_device *bdev, sector_t sector,
  103. struct page *page, int rw)
  104. {
  105. struct pmem_device *pmem = bdev->bd_disk->private_data;
  106. int rc;
  107. rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
  108. if (rw & WRITE)
  109. wmb_pmem();
  110. /*
  111. * The ->rw_page interface is subtle and tricky. The core
  112. * retries on any error, so we can only invoke page_endio() in
  113. * the successful completion case. Otherwise, we'll see crashes
  114. * caused by double completion.
  115. */
  116. if (rc == 0)
  117. page_endio(page, rw & WRITE, 0);
  118. return rc;
  119. }
  120. static long pmem_direct_access(struct block_device *bdev, sector_t sector,
  121. void __pmem **kaddr, pfn_t *pfn)
  122. {
  123. struct pmem_device *pmem = bdev->bd_disk->private_data;
  124. resource_size_t offset = sector * 512 + pmem->data_offset;
  125. *kaddr = pmem->virt_addr + offset;
  126. *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
  127. return pmem->size - offset;
  128. }
  129. static const struct block_device_operations pmem_fops = {
  130. .owner = THIS_MODULE,
  131. .rw_page = pmem_rw_page,
  132. .direct_access = pmem_direct_access,
  133. .revalidate_disk = nvdimm_revalidate_disk,
  134. };
  135. static struct pmem_device *pmem_alloc(struct device *dev,
  136. struct resource *res, int id)
  137. {
  138. struct pmem_device *pmem;
  139. struct request_queue *q;
  140. pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
  141. if (!pmem)
  142. return ERR_PTR(-ENOMEM);
  143. pmem->phys_addr = res->start;
  144. pmem->size = resource_size(res);
  145. if (!arch_has_wmb_pmem())
  146. dev_warn(dev, "unable to guarantee persistence of writes\n");
  147. if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
  148. dev_name(dev))) {
  149. dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
  150. &pmem->phys_addr, pmem->size);
  151. return ERR_PTR(-EBUSY);
  152. }
  153. q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
  154. if (!q)
  155. return ERR_PTR(-ENOMEM);
  156. pmem->pfn_flags = PFN_DEV;
  157. if (pmem_should_map_pages(dev)) {
  158. pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
  159. &q->q_usage_counter, NULL);
  160. pmem->pfn_flags |= PFN_MAP;
  161. } else
  162. pmem->virt_addr = (void __pmem *) devm_memremap(dev,
  163. pmem->phys_addr, pmem->size,
  164. ARCH_MEMREMAP_PMEM);
  165. if (IS_ERR(pmem->virt_addr)) {
  166. blk_cleanup_queue(q);
  167. return (void __force *) pmem->virt_addr;
  168. }
  169. pmem->pmem_queue = q;
  170. return pmem;
  171. }
  172. static void pmem_detach_disk(struct pmem_device *pmem)
  173. {
  174. if (!pmem->pmem_disk)
  175. return;
  176. del_gendisk(pmem->pmem_disk);
  177. put_disk(pmem->pmem_disk);
  178. blk_cleanup_queue(pmem->pmem_queue);
  179. }
  180. static int pmem_attach_disk(struct device *dev,
  181. struct nd_namespace_common *ndns, struct pmem_device *pmem)
  182. {
  183. int nid = dev_to_node(dev);
  184. struct gendisk *disk;
  185. blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
  186. blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
  187. blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
  188. blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
  189. queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
  190. disk = alloc_disk_node(0, nid);
  191. if (!disk) {
  192. blk_cleanup_queue(pmem->pmem_queue);
  193. return -ENOMEM;
  194. }
  195. disk->major = pmem_major;
  196. disk->first_minor = 0;
  197. disk->fops = &pmem_fops;
  198. disk->private_data = pmem;
  199. disk->queue = pmem->pmem_queue;
  200. disk->flags = GENHD_FL_EXT_DEVT;
  201. nvdimm_namespace_disk_name(ndns, disk->disk_name);
  202. disk->driverfs_dev = dev;
  203. set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
  204. pmem->pmem_disk = disk;
  205. devm_exit_badblocks(dev, &pmem->bb);
  206. if (devm_init_badblocks(dev, &pmem->bb))
  207. return -ENOMEM;
  208. nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
  209. disk->bb = &pmem->bb;
  210. add_disk(disk);
  211. revalidate_disk(disk);
  212. return 0;
  213. }
  214. static int pmem_rw_bytes(struct nd_namespace_common *ndns,
  215. resource_size_t offset, void *buf, size_t size, int rw)
  216. {
  217. struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
  218. if (unlikely(offset + size > pmem->size)) {
  219. dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
  220. return -EFAULT;
  221. }
  222. if (rw == READ) {
  223. unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
  224. if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
  225. return -EIO;
  226. memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
  227. } else {
  228. memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
  229. wmb_pmem();
  230. }
  231. return 0;
  232. }
  233. static int nd_pfn_init(struct nd_pfn *nd_pfn)
  234. {
  235. struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
  236. struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
  237. struct nd_namespace_common *ndns = nd_pfn->ndns;
  238. struct nd_region *nd_region;
  239. unsigned long npfns;
  240. phys_addr_t offset;
  241. u64 checksum;
  242. int rc;
  243. if (!pfn_sb)
  244. return -ENOMEM;
  245. nd_pfn->pfn_sb = pfn_sb;
  246. rc = nd_pfn_validate(nd_pfn);
  247. if (rc == -ENODEV)
  248. /* no info block, do init */;
  249. else
  250. return rc;
  251. nd_region = to_nd_region(nd_pfn->dev.parent);
  252. if (nd_region->ro) {
  253. dev_info(&nd_pfn->dev,
  254. "%s is read-only, unable to init metadata\n",
  255. dev_name(&nd_region->dev));
  256. goto err;
  257. }
  258. memset(pfn_sb, 0, sizeof(*pfn_sb));
  259. npfns = (pmem->size - SZ_8K) / SZ_4K;
  260. /*
  261. * Note, we use 64 here for the standard size of struct page,
  262. * debugging options may cause it to be larger in which case the
  263. * implementation will limit the pfns advertised through
  264. * ->direct_access() to those that are included in the memmap.
  265. */
  266. if (nd_pfn->mode == PFN_MODE_PMEM)
  267. offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
  268. else if (nd_pfn->mode == PFN_MODE_RAM)
  269. offset = ALIGN(SZ_8K, nd_pfn->align);
  270. else
  271. goto err;
  272. npfns = (pmem->size - offset) / SZ_4K;
  273. pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
  274. pfn_sb->dataoff = cpu_to_le64(offset);
  275. pfn_sb->npfns = cpu_to_le64(npfns);
  276. memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
  277. memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
  278. memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
  279. pfn_sb->version_major = cpu_to_le16(1);
  280. checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
  281. pfn_sb->checksum = cpu_to_le64(checksum);
  282. rc = nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
  283. if (rc)
  284. goto err;
  285. return 0;
  286. err:
  287. nd_pfn->pfn_sb = NULL;
  288. kfree(pfn_sb);
  289. return -ENXIO;
  290. }
  291. static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
  292. {
  293. struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
  294. struct pmem_device *pmem;
  295. /* free pmem disk */
  296. pmem = dev_get_drvdata(&nd_pfn->dev);
  297. pmem_detach_disk(pmem);
  298. /* release nd_pfn resources */
  299. kfree(nd_pfn->pfn_sb);
  300. nd_pfn->pfn_sb = NULL;
  301. return 0;
  302. }
  303. static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
  304. {
  305. struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
  306. struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
  307. struct device *dev = &nd_pfn->dev;
  308. struct nd_region *nd_region;
  309. struct vmem_altmap *altmap;
  310. struct nd_pfn_sb *pfn_sb;
  311. struct pmem_device *pmem;
  312. struct request_queue *q;
  313. phys_addr_t offset;
  314. int rc;
  315. struct vmem_altmap __altmap = {
  316. .base_pfn = __phys_to_pfn(nsio->res.start),
  317. .reserve = __phys_to_pfn(SZ_8K),
  318. };
  319. if (!nd_pfn->uuid || !nd_pfn->ndns)
  320. return -ENODEV;
  321. nd_region = to_nd_region(dev->parent);
  322. rc = nd_pfn_init(nd_pfn);
  323. if (rc)
  324. return rc;
  325. pfn_sb = nd_pfn->pfn_sb;
  326. offset = le64_to_cpu(pfn_sb->dataoff);
  327. nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
  328. if (nd_pfn->mode == PFN_MODE_RAM) {
  329. if (offset < SZ_8K)
  330. return -EINVAL;
  331. nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
  332. altmap = NULL;
  333. } else if (nd_pfn->mode == PFN_MODE_PMEM) {
  334. nd_pfn->npfns = (resource_size(&nsio->res) - offset)
  335. / PAGE_SIZE;
  336. if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
  337. dev_info(&nd_pfn->dev,
  338. "number of pfns truncated from %lld to %ld\n",
  339. le64_to_cpu(nd_pfn->pfn_sb->npfns),
  340. nd_pfn->npfns);
  341. altmap = & __altmap;
  342. altmap->free = __phys_to_pfn(offset - SZ_8K);
  343. altmap->alloc = 0;
  344. } else {
  345. rc = -ENXIO;
  346. goto err;
  347. }
  348. /* establish pfn range for lookup, and switch to direct map */
  349. pmem = dev_get_drvdata(dev);
  350. q = pmem->pmem_queue;
  351. devm_memunmap(dev, (void __force *) pmem->virt_addr);
  352. pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
  353. &q->q_usage_counter, altmap);
  354. pmem->pfn_flags |= PFN_MAP;
  355. if (IS_ERR(pmem->virt_addr)) {
  356. rc = PTR_ERR(pmem->virt_addr);
  357. goto err;
  358. }
  359. /* attach pmem disk in "pfn-mode" */
  360. pmem->data_offset = offset;
  361. rc = pmem_attach_disk(dev, ndns, pmem);
  362. if (rc)
  363. goto err;
  364. return rc;
  365. err:
  366. nvdimm_namespace_detach_pfn(ndns);
  367. return rc;
  368. }
  369. static int nd_pmem_probe(struct device *dev)
  370. {
  371. struct nd_region *nd_region = to_nd_region(dev->parent);
  372. struct nd_namespace_common *ndns;
  373. struct nd_namespace_io *nsio;
  374. struct pmem_device *pmem;
  375. ndns = nvdimm_namespace_common_probe(dev);
  376. if (IS_ERR(ndns))
  377. return PTR_ERR(ndns);
  378. nsio = to_nd_namespace_io(&ndns->dev);
  379. pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
  380. if (IS_ERR(pmem))
  381. return PTR_ERR(pmem);
  382. pmem->ndns = ndns;
  383. dev_set_drvdata(dev, pmem);
  384. ndns->rw_bytes = pmem_rw_bytes;
  385. if (devm_init_badblocks(dev, &pmem->bb))
  386. return -ENOMEM;
  387. nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
  388. if (is_nd_btt(dev)) {
  389. /* btt allocates its own request_queue */
  390. blk_cleanup_queue(pmem->pmem_queue);
  391. pmem->pmem_queue = NULL;
  392. return nvdimm_namespace_attach_btt(ndns);
  393. }
  394. if (is_nd_pfn(dev))
  395. return nvdimm_namespace_attach_pfn(ndns);
  396. if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) {
  397. /*
  398. * We'll come back as either btt-pmem, or pfn-pmem, so
  399. * drop the queue allocation for now.
  400. */
  401. blk_cleanup_queue(pmem->pmem_queue);
  402. return -ENXIO;
  403. }
  404. return pmem_attach_disk(dev, ndns, pmem);
  405. }
  406. static int nd_pmem_remove(struct device *dev)
  407. {
  408. struct pmem_device *pmem = dev_get_drvdata(dev);
  409. if (is_nd_btt(dev))
  410. nvdimm_namespace_detach_btt(pmem->ndns);
  411. else if (is_nd_pfn(dev))
  412. nvdimm_namespace_detach_pfn(pmem->ndns);
  413. else
  414. pmem_detach_disk(pmem);
  415. return 0;
  416. }
  417. static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
  418. {
  419. struct pmem_device *pmem = dev_get_drvdata(dev);
  420. struct nd_namespace_common *ndns = pmem->ndns;
  421. if (event != NVDIMM_REVALIDATE_POISON)
  422. return;
  423. if (is_nd_btt(dev))
  424. nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
  425. else
  426. nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
  427. }
  428. MODULE_ALIAS("pmem");
  429. MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
  430. MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
  431. static struct nd_device_driver nd_pmem_driver = {
  432. .probe = nd_pmem_probe,
  433. .remove = nd_pmem_remove,
  434. .notify = nd_pmem_notify,
  435. .drv = {
  436. .name = "nd_pmem",
  437. },
  438. .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
  439. };
  440. static int __init pmem_init(void)
  441. {
  442. int error;
  443. pmem_major = register_blkdev(0, "pmem");
  444. if (pmem_major < 0)
  445. return pmem_major;
  446. error = nd_driver_register(&nd_pmem_driver);
  447. if (error) {
  448. unregister_blkdev(pmem_major, "pmem");
  449. return error;
  450. }
  451. return 0;
  452. }
  453. module_init(pmem_init);
  454. static void pmem_exit(void)
  455. {
  456. driver_unregister(&nd_pmem_driver.drv);
  457. unregister_blkdev(pmem_major, "pmem");
  458. }
  459. module_exit(pmem_exit);
  460. MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
  461. MODULE_LICENSE("GPL v2");