dax.c
/*
 * Copyright(c) 2016 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pfn_t.h>
#include <linux/hash.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "dax.h"

static dev_t dax_devt;
static struct class *dax_class;
static DEFINE_IDA(dax_minor_ida);
static int nr_dax = CONFIG_NR_DEV_DAX;
module_param(nr_dax, int, S_IRUGO);
static struct vfsmount *dax_mnt;
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;
MODULE_PARM_DESC(nr_dax, "max number of device-dax instances");
/**
 * struct dax_region - mapping infrastructure for dax devices
 * @id: kernel-wide unique region for a memory range
 * @ida: instance id allocator for child dax devices
 * @base: linear address corresponding to @res
 * @kref: to pin while other agents have a need to do lookups
 * @dev: parent device backing this region
 * @align: allocation and mapping alignment for child dax devices
 * @res: physical address range of the region
 * @pfn_flags: identify whether the pfns are paged back or not
 */
struct dax_region {
	int id;
	struct ida ida;
	void *base;
	struct kref kref;
	struct device *dev;
	unsigned int align;
	struct resource res;
	unsigned long pfn_flags;
};
/**
 * struct dax_dev - subdivision of a dax region
 * @region: parent region
 * @inode: inode backing the device's shared address_space
 * @dev: device backing the character device
 * @cdev: core chardev data
 * @alive: !alive + rcu grace period == no new mappings can be established
 * @id: child id in the region
 * @num_resources: number of physical address extents in this device
 * @res: array of physical address ranges
 */
struct dax_dev {
	struct dax_region *region;
	struct inode *inode;
	struct device dev;
	struct cdev cdev;
	bool alive;
	int id;
	int num_resources;
	struct resource res[0];
};
static ssize_t id_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region;
	ssize_t rc = -ENXIO;

	device_lock(dev);
	dax_region = dev_get_drvdata(dev);
	if (dax_region)
		rc = sprintf(buf, "%d\n", dax_region->id);
	device_unlock(dev);

	return rc;
}
static DEVICE_ATTR_RO(id);

static ssize_t region_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region;
	ssize_t rc = -ENXIO;

	device_lock(dev);
	dax_region = dev_get_drvdata(dev);
	if (dax_region)
		rc = sprintf(buf, "%llu\n", (unsigned long long)
				resource_size(&dax_region->res));
	device_unlock(dev);

	return rc;
}
static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
		region_size_show, NULL);

static ssize_t align_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region;
	ssize_t rc = -ENXIO;

	device_lock(dev);
	dax_region = dev_get_drvdata(dev);
	if (dax_region)
		rc = sprintf(buf, "%u\n", dax_region->align);
	device_unlock(dev);

	return rc;
}
static DEVICE_ATTR_RO(align);

static struct attribute *dax_region_attributes[] = {
	&dev_attr_region_size.attr,
	&dev_attr_align.attr,
	&dev_attr_id.attr,
	NULL,
};

static const struct attribute_group dax_region_attribute_group = {
	.name = "dax_region",
	.attrs = dax_region_attributes,
};

static const struct attribute_group *dax_region_attribute_groups[] = {
	&dax_region_attribute_group,
	NULL,
};
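
/*
 * device-dax instances are hosted on a private pseudo-filesystem so
 * that each instance gets a dedicated inode, and with it an
 * address_space that unregister_dax_dev() can use to invalidate all
 * established mappings for the device.
 */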
static struct inode *dax_alloc_inode(struct super_block *sb)
{
	return kmem_cache_alloc(dax_cache, GFP_KERNEL);
}

static void dax_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	kmem_cache_free(dax_cache, inode);
}

static void dax_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, dax_i_callback);
}

static const struct super_operations dax_sops = {
	.statfs = simple_statfs,
	.alloc_inode = dax_alloc_inode,
	.destroy_inode = dax_destroy_inode,
	.drop_inode = generic_delete_inode,
};

static struct dentry *dax_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
}

static struct file_system_type dax_type = {
	.name = "dax",
	.mount = dax_mount,
	.kill_sb = kill_anon_super,
};

static int dax_test(struct inode *inode, void *data)
{
	return inode->i_cdev == data;
}

static int dax_set(struct inode *inode, void *data)
{
	inode->i_cdev = data;
	return 0;
}
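
/*
 * Look up (or allocate) the inode for a given device-dax cdev on the
 * private "dax" pseudo-filesystem, keyed by the cdev pointer.
 */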
static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt)
{
	struct inode *inode;

	inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
			dax_test, dax_set, cdev);
	if (!inode)
		return NULL;

	if (inode->i_state & I_NEW) {
		inode->i_mode = S_IFCHR;
		inode->i_flags = S_DAX;
		inode->i_rdev = devt;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		unlock_new_inode(inode);
	}
	return inode;
}

static void init_once(void *inode)
{
	inode_init_once(inode);
}
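
/*
 * Register the inode cache and mount the "dax" pseudo-filesystem that
 * backs the per-device inodes.
 */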
static int dax_inode_init(void)
{
	int rc;

	dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0,
			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
			 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
			init_once);
	if (!dax_cache)
		return -ENOMEM;

	rc = register_filesystem(&dax_type);
	if (rc)
		goto err_register_fs;

	dax_mnt = kern_mount(&dax_type);
	if (IS_ERR(dax_mnt)) {
		rc = PTR_ERR(dax_mnt);
		goto err_mount;
	}
	dax_superblock = dax_mnt->mnt_sb;

	return 0;

 err_mount:
	unregister_filesystem(&dax_type);
 err_register_fs:
	kmem_cache_destroy(dax_cache);

	return rc;
}

static void dax_inode_exit(void)
{
	kern_unmount(dax_mnt);
	unregister_filesystem(&dax_type);
	kmem_cache_destroy(dax_cache);
}
static void dax_region_free(struct kref *kref)
{
	struct dax_region *dax_region;

	dax_region = container_of(kref, struct dax_region, kref);
	kfree(dax_region);
}

void dax_region_put(struct dax_region *dax_region)
{
	kref_put(&dax_region->kref, dax_region_free);
}
EXPORT_SYMBOL_GPL(dax_region_put);

static void dax_region_unregister(void *region)
{
	struct dax_region *dax_region = region;

	sysfs_remove_groups(&dax_region->dev->kobj,
			dax_region_attribute_groups);
	dax_region_put(dax_region);
}
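
/**
 * alloc_dax_region - allocate and publish a new dax_region
 * @parent: device to which the region belongs; its drvdata is claimed
 * @region_id: kernel-wide unique id for the region
 * @res: physical address range of the region
 * @align: allocation and mapping alignment for child dax devices
 * @addr: linear address corresponding to @res
 * @pfn_flags: PFN_DEV / PFN_MAP properties of the region's pfns
 *
 * The region's sysfs group and reference are released automatically
 * (devm) when @parent is unbound.
 */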
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
		struct resource *res, unsigned int align, void *addr,
		unsigned long pfn_flags)
{
	struct dax_region *dax_region;

	/*
	 * The DAX core assumes that it can store its private data in
	 * parent->driver_data. This WARN is a reminder / safeguard for
	 * developers of device-dax drivers.
	 */
	if (dev_get_drvdata(parent)) {
		dev_WARN(parent, "dax core failed to setup private data\n");
		return NULL;
	}

	if (!IS_ALIGNED(res->start, align)
			|| !IS_ALIGNED(resource_size(res), align))
		return NULL;

	dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
	if (!dax_region)
		return NULL;

	dev_set_drvdata(parent, dax_region);
	memcpy(&dax_region->res, res, sizeof(*res));
	dax_region->pfn_flags = pfn_flags;
	kref_init(&dax_region->kref);
	dax_region->id = region_id;
	ida_init(&dax_region->ida);
	dax_region->align = align;
	dax_region->dev = parent;
	dax_region->base = addr;
	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
		kfree(dax_region);
		return NULL;
	}

	kref_get(&dax_region->kref);
	if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region))
		return NULL;
	return dax_region;
}
EXPORT_SYMBOL_GPL(alloc_dax_region);
static struct dax_dev *to_dax_dev(struct device *dev)
{
	return container_of(dev, struct dax_dev, dev);
}

static ssize_t size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_dev *dax_dev = to_dax_dev(dev);
	unsigned long long size = 0;
	int i;

	for (i = 0; i < dax_dev->num_resources; i++)
		size += resource_size(&dax_dev->res[i]);

	return sprintf(buf, "%llu\n", size);
}
static DEVICE_ATTR_RO(size);

static struct attribute *dax_device_attributes[] = {
	&dev_attr_size.attr,
	NULL,
};

static const struct attribute_group dax_device_attribute_group = {
	.attrs = dax_device_attributes,
};

static const struct attribute_group *dax_attribute_groups[] = {
	&dax_device_attribute_group,
	NULL,
};
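
/*
 * Validate that a vma is suitable for device-dax: the device must
 * still be alive and the mapping must be shared, aligned to the
 * region, and DAX capable.
 */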
static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
		const char *func)
{
	struct dax_region *dax_region = dax_dev->region;
	struct device *dev = &dax_dev->dev;
	unsigned long mask;

	if (!dax_dev->alive)
		return -ENXIO;

	/* prevent private mappings from being established */
	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
				current->comm, func);
		return -EINVAL;
	}

	mask = dax_region->align - 1;
	if (vma->vm_start & mask || vma->vm_end & mask) {
		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
				current->comm, func, vma->vm_start, vma->vm_end,
				mask);
		return -EINVAL;
	}

	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
				current->comm, func);
		return -EINVAL;
	}

	if (!vma_is_dax(vma)) {
		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
				current->comm, func);
		return -EINVAL;
	}

	return 0;
}
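
/*
 * Translate a page offset into the physical address it maps to within
 * one of the device's resources, verifying that @size bytes fit
 * entirely inside that resource. Returns -1 on failure.
 */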
static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
		unsigned long size)
{
	struct resource *res;
	phys_addr_t phys;
	int i;

	for (i = 0; i < dax_dev->num_resources; i++) {
		res = &dax_dev->res[i];
		phys = pgoff * PAGE_SIZE + res->start;
		if (phys >= res->start && phys <= res->end)
			break;
		pgoff -= PHYS_PFN(resource_size(res));
	}

	if (i < dax_dev->num_resources) {
		res = &dax_dev->res[i];
		if (phys + size - 1 <= res->end)
			return phys;
	}

	return -1;
}
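
/* Map a single page of the device at the faulting address. */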
static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
{
	struct device *dev = &dax_dev->dev;
	struct dax_region *dax_region;
	int rc = VM_FAULT_SIGBUS;
	phys_addr_t phys;
	pfn_t pfn;
	unsigned int fault_size = PAGE_SIZE;

	if (check_vma(dax_dev, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dax_dev->region;
	if (dax_region->align > PAGE_SIZE) {
		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size != dax_region->align)
		return VM_FAULT_SIGBUS;

	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				vmf->pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);

	if (rc == -ENOMEM)
		return VM_FAULT_OOM;
	if (rc < 0 && rc != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}
static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct device *dev = &dax_dev->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;
	unsigned int fault_size = PMD_SIZE;

	if (check_vma(dax_dev, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dax_dev->region;
	if (dax_region->align > PMD_SIZE) {
		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	/* dax pmd mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pmd_addr < vmf->vma->vm_start ||
			(pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pmd_addr);
	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
{
	unsigned long pud_addr = vmf->address & PUD_MASK;
	struct device *dev = &dax_dev->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;
	unsigned int fault_size = PUD_SIZE;

	if (check_vma(dax_dev, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dax_dev->region;
	if (dax_region->align > PUD_SIZE) {
		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	/* dax pud mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pud_addr < vmf->vma->vm_start ||
			(pud_addr + PUD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pud_addr);
	phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}
#else
static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
{
	return VM_FAULT_FALLBACK;
}
#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
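
/*
 * Dispatch a fault to the PTE, PMD, or PUD handler based on the
 * requested page entry size. The rcu read lock pairs with the
 * synchronize_rcu() in unregister_dax_dev() so that handlers which
 * saw dax_dev->alive == true have finished before teardown proceeds.
 */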
static int dax_dev_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	int rc;
	struct file *filp = vmf->vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
			? "write" : "read",
			vmf->vma->vm_start, vmf->vma->vm_end);

	rcu_read_lock();
	switch (pe_size) {
	case PE_SIZE_PTE:
		rc = __dax_dev_pte_fault(dax_dev, vmf);
		break;
	case PE_SIZE_PMD:
		rc = __dax_dev_pmd_fault(dax_dev, vmf);
		break;
	case PE_SIZE_PUD:
		rc = __dax_dev_pud_fault(dax_dev, vmf);
		break;
	default:
		/* fall back without leaking the rcu read lock */
		rc = VM_FAULT_FALLBACK;
	}
	rcu_read_unlock();

	return rc;
}

static int dax_dev_fault(struct vm_fault *vmf)
{
	return dax_dev_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct dax_dev_vm_ops = {
	.fault = dax_dev_fault,
	.huge_fault = dax_dev_huge_fault,
};
static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct dax_dev *dax_dev = filp->private_data;
	int rc;

	dev_dbg(&dax_dev->dev, "%s\n", __func__);

	rc = check_vma(dax_dev, vma, __func__);
	if (rc)
		return rc;

	vma->vm_ops = &dax_dev_vm_ops;
	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}
/* return an unmapped area aligned to the dax region specified alignment */
static unsigned long dax_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		unsigned long flags)
{
	unsigned long off, off_end, off_align, len_align, addr_align, align;
	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
	struct dax_region *dax_region;

	if (!dax_dev || addr)
		goto out;

	dax_region = dax_dev->region;
	align = dax_region->align;
	off = pgoff << PAGE_SHIFT;
	off_end = off + len;
	off_align = round_up(off, align);

	if ((off_end <= off_align) || ((off_end - off_align) < align))
		goto out;

	len_align = len + align;
	if ((off + len_align) < off)
		goto out;

	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
			pgoff, flags);
	if (!IS_ERR_VALUE(addr_align)) {
		addr_align += (off - addr_align) & (align - 1);
		return addr_align;
	}
 out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
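
/*
 * Point the chardev inode and the struct file at the per-device
 * inode's address_space so that every open of the device shares one
 * mapping.
 */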
static int dax_open(struct inode *inode, struct file *filp)
{
	struct dax_dev *dax_dev;

	dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev);
	dev_dbg(&dax_dev->dev, "%s\n", __func__);
	inode->i_mapping = dax_dev->inode->i_mapping;
	inode->i_mapping->host = dax_dev->inode;
	filp->f_mapping = inode->i_mapping;
	filp->private_data = dax_dev;
	inode->i_flags = S_DAX;

	return 0;
}

static int dax_release(struct inode *inode, struct file *filp)
{
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(&dax_dev->dev, "%s\n", __func__);
	return 0;
}

static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_open,
	.release = dax_release,
	.get_unmapped_area = dax_get_unmapped_area,
	.mmap = dax_mmap,
};
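
/*
 * Final teardown, runs once the last reference to the device is
 * dropped: return the instance and minor ids, drop the region
 * reference, and free the inode and dax_dev allocation.
 */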
static void dax_dev_release(struct device *dev)
{
	struct dax_dev *dax_dev = to_dax_dev(dev);
	struct dax_region *dax_region = dax_dev->region;

	ida_simple_remove(&dax_region->ida, dax_dev->id);
	ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
	dax_region_put(dax_region);
	iput(dax_dev->inode);
	kfree(dax_dev);
}
static void unregister_dax_dev(void *dev)
{
	struct dax_dev *dax_dev = to_dax_dev(dev);
	struct cdev *cdev = &dax_dev->cdev;

	dev_dbg(dev, "%s\n", __func__);

	/*
	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
	 * ensuring that any fault handlers that might have seen
	 * dax_dev->alive == true, have completed. Any fault handlers
	 * that start after synchronize_rcu() has started will abort
	 * upon seeing dax_dev->alive == false.
	 */
	dax_dev->alive = false;
	synchronize_rcu();
	unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1);
	cdev_del(cdev);
	device_unregister(dev);
}
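
/**
 * devm_create_dax_dev - create and register a device-dax instance
 * @dax_region: parent region, pinned for the lifetime of the device
 * @res: array of physical address ranges backing the instance
 * @count: number of entries in @res
 *
 * The instance is unregistered automatically (devm) when the region's
 * parent device is unbound.
 */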
struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region,
		struct resource *res, int count)
{
	struct device *parent = dax_region->dev;
	struct dax_dev *dax_dev;
	int rc = 0, minor, i;
	struct device *dev;
	struct cdev *cdev;
	dev_t dev_t;

	dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);
	if (!dax_dev)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < count; i++) {
		if (!IS_ALIGNED(res[i].start, dax_region->align)
				|| !IS_ALIGNED(resource_size(&res[i]),
					dax_region->align)) {
			rc = -EINVAL;
			break;
		}
		dax_dev->res[i].start = res[i].start;
		dax_dev->res[i].end = res[i].end;
	}

	if (i < count)
		goto err_id;

	dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
	if (dax_dev->id < 0) {
		rc = dax_dev->id;
		goto err_id;
	}

	minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL);
	if (minor < 0) {
		rc = minor;
		goto err_minor;
	}

	dev_t = MKDEV(MAJOR(dax_devt), minor);
	dev = &dax_dev->dev;
	dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t);
	if (!dax_dev->inode) {
		rc = -ENOMEM;
		goto err_inode;
	}

	/* device_initialize() so cdev can reference kobj parent */
	device_initialize(dev);

	cdev = &dax_dev->cdev;
	cdev_init(cdev, &dax_fops);
	cdev->owner = parent->driver->owner;
	cdev->kobj.parent = &dev->kobj;
	rc = cdev_add(&dax_dev->cdev, dev_t, 1);
	if (rc)
		goto err_cdev;

	/* from here on we're committed to teardown via dax_dev_release() */
	dax_dev->num_resources = count;
	dax_dev->alive = true;
	dax_dev->region = dax_region;
	kref_get(&dax_region->kref);

	dev->devt = dev_t;
	dev->class = dax_class;
	dev->parent = parent;
	dev->groups = dax_attribute_groups;
	dev->release = dax_dev_release;
	dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id);
	rc = device_add(dev);
	if (rc) {
		put_device(dev);
		return ERR_PTR(rc);
	}

	rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev);
	if (rc)
		return ERR_PTR(rc);

	return dax_dev;

 err_cdev:
	iput(dax_dev->inode);
 err_inode:
	ida_simple_remove(&dax_minor_ida, minor);
 err_minor:
	ida_simple_remove(&dax_region->ida, dax_dev->id);
 err_id:
	kfree(dax_dev);

	return ERR_PTR(rc);
}
EXPORT_SYMBOL_GPL(devm_create_dax_dev);
static int __init dax_init(void)
{
	int rc;

	rc = dax_inode_init();
	if (rc)
		return rc;

	nr_dax = max(nr_dax, 256);
	rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax");
	if (rc)
		goto err_chrdev;

	dax_class = class_create(THIS_MODULE, "dax");
	if (IS_ERR(dax_class)) {
		rc = PTR_ERR(dax_class);
		goto err_class;
	}

	return 0;

 err_class:
	unregister_chrdev_region(dax_devt, nr_dax);
 err_chrdev:
	dax_inode_exit();
	return rc;
}

static void __exit dax_exit(void)
{
	class_destroy(dax_class);
	unregister_chrdev_region(dax_devt, nr_dax);
	ida_destroy(&dax_minor_ida);
	dax_inode_exit();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_init);
module_exit(dax_exit);