hmm.c

  1. /*
  2. * Copyright 2013 Red Hat Inc.
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 2 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * Authors: Jérôme Glisse <jglisse@redhat.com>
  15. */
  16. /*
  17. * Refer to include/linux/hmm.h for information about heterogeneous memory
  18. * management or HMM for short.
  19. */
  20. #include <linux/mm.h>
  21. #include <linux/hmm.h>
  22. #include <linux/init.h>
  23. #include <linux/rmap.h>
  24. #include <linux/swap.h>
  25. #include <linux/slab.h>
  26. #include <linux/sched.h>
  27. #include <linux/mmzone.h>
  28. #include <linux/pagemap.h>
  29. #include <linux/swapops.h>
  30. #include <linux/hugetlb.h>
  31. #include <linux/memremap.h>
  32. #include <linux/jump_label.h>
  33. #include <linux/mmu_notifier.h>
  34. #include <linux/memory_hotplug.h>
  35. #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
  36. #if IS_ENABLED(CONFIG_HMM_MIRROR)
  37. static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
  38. /*
  39. * struct hmm - HMM per mm struct
  40. *
  41. * @mm: mm struct this HMM struct is bound to
  42. * @lock: lock protecting ranges list
  43. * @sequence: we track updates to the CPU page table with a sequence number
  44. * @ranges: list of range being snapshotted
  45. * @mirrors: list of mirrors for this mm
  46. * @mmu_notifier: mmu notifier to track updates to CPU page table
  47. * @mirrors_sem: read/write semaphore protecting the mirrors list
  48. */
  49. struct hmm {
  50. struct mm_struct *mm;
  51. spinlock_t lock;
  52. atomic_t sequence;
  53. struct list_head ranges;
  54. struct list_head mirrors;
  55. struct mmu_notifier mmu_notifier;
  56. struct rw_semaphore mirrors_sem;
  57. };
  58. /*
  59. * hmm_register - register HMM against an mm (HMM internal)
  60. *
  61. * @mm: mm struct to attach to
  62. *
  63. * This is not intended to be used directly by device drivers. It allocates an
  64. * HMM struct if mm does not have one, and initializes it.
  65. */
  66. static struct hmm *hmm_register(struct mm_struct *mm)
  67. {
  68. struct hmm *hmm = READ_ONCE(mm->hmm);
  69. bool cleanup = false;
  70. /*
  71. * The hmm struct can only be freed once the mm_struct goes away,
  72. * hence any hmm struct we find attached to the mm is still valid
  73. * and can simply be returned.
  74. */
  75. if (hmm)
  76. return hmm;
  77. hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
  78. if (!hmm)
  79. return NULL;
  80. INIT_LIST_HEAD(&hmm->mirrors);
  81. init_rwsem(&hmm->mirrors_sem);
  82. atomic_set(&hmm->sequence, 0);
  83. hmm->mmu_notifier.ops = NULL;
  84. INIT_LIST_HEAD(&hmm->ranges);
  85. spin_lock_init(&hmm->lock);
  86. hmm->mm = mm;
  87. /*
  88. * We should only get here if we hold the mmap_sem in write mode, i.e. on
  89. * registration of the first mirror through hmm_mirror_register().
  90. */
  91. hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
  92. if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
  93. kfree(hmm);
  94. return NULL;
  95. }
  96. spin_lock(&mm->page_table_lock);
  97. if (!mm->hmm)
  98. mm->hmm = hmm;
  99. else
  100. cleanup = true;
  101. spin_unlock(&mm->page_table_lock);
  102. if (cleanup) {
  103. mmu_notifier_unregister(&hmm->mmu_notifier, mm);
  104. kfree(hmm);
  105. }
  106. return mm->hmm;
  107. }
  108. void hmm_mm_destroy(struct mm_struct *mm)
  109. {
  110. kfree(mm->hmm);
  111. }
  112. static void hmm_invalidate_range(struct hmm *hmm,
  113. enum hmm_update_type action,
  114. unsigned long start,
  115. unsigned long end)
  116. {
  117. struct hmm_mirror *mirror;
  118. struct hmm_range *range;
  119. spin_lock(&hmm->lock);
  120. list_for_each_entry(range, &hmm->ranges, list) {
  121. unsigned long addr, idx, npages;
  122. if (end < range->start || start >= range->end)
  123. continue;
  124. range->valid = false;
  125. addr = max(start, range->start);
  126. idx = (addr - range->start) >> PAGE_SHIFT;
  127. npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
  128. memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
  129. }
  130. spin_unlock(&hmm->lock);
  131. down_read(&hmm->mirrors_sem);
  132. list_for_each_entry(mirror, &hmm->mirrors, list)
  133. mirror->ops->sync_cpu_device_pagetables(mirror, action,
  134. start, end);
  135. up_read(&hmm->mirrors_sem);
  136. }
  137. static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
  138. {
  139. struct hmm_mirror *mirror;
  140. struct hmm *hmm = mm->hmm;
  141. down_write(&hmm->mirrors_sem);
  142. mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
  143. list);
  144. while (mirror) {
  145. list_del_init(&mirror->list);
  146. if (mirror->ops->release) {
  147. /*
  148. * Drop mirrors_sem so callback can wait on any pending
  149. * work that might itself trigger mmu_notifier callback
  150. * and thus would deadlock with us.
  151. */
  152. up_write(&hmm->mirrors_sem);
  153. mirror->ops->release(mirror);
  154. down_write(&hmm->mirrors_sem);
  155. }
  156. mirror = list_first_entry_or_null(&hmm->mirrors,
  157. struct hmm_mirror, list);
  158. }
  159. up_write(&hmm->mirrors_sem);
  160. }
  161. static void hmm_invalidate_range_start(struct mmu_notifier *mn,
  162. struct mm_struct *mm,
  163. unsigned long start,
  164. unsigned long end)
  165. {
  166. struct hmm *hmm = mm->hmm;
  167. VM_BUG_ON(!hmm);
  168. atomic_inc(&hmm->sequence);
  169. }
  170. static void hmm_invalidate_range_end(struct mmu_notifier *mn,
  171. struct mm_struct *mm,
  172. unsigned long start,
  173. unsigned long end)
  174. {
  175. struct hmm *hmm = mm->hmm;
  176. VM_BUG_ON(!hmm);
  177. hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
  178. }
  179. static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
  180. .release = hmm_release,
  181. .invalidate_range_start = hmm_invalidate_range_start,
  182. .invalidate_range_end = hmm_invalidate_range_end,
  183. };
  184. /*
  185. * hmm_mirror_register() - register a mirror against an mm
  186. *
  187. * @mirror: new mirror struct to register
  188. * @mm: mm to register against
  189. *
  190. * To start mirroring a process address space, the device driver must register
  191. * an HMM mirror struct.
  192. *
  193. * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
  194. */
  195. int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
  196. {
  197. /* Sanity check */
  198. if (!mm || !mirror || !mirror->ops)
  199. return -EINVAL;
  200. again:
  201. mirror->hmm = hmm_register(mm);
  202. if (!mirror->hmm)
  203. return -ENOMEM;
  204. down_write(&mirror->hmm->mirrors_sem);
  205. if (mirror->hmm->mm == NULL) {
  206. /*
  207. * A racing hmm_mirror_unregister() is about to destroy the hmm
  208. * struct. Try again to allocate a new one.
  209. */
  210. up_write(&mirror->hmm->mirrors_sem);
  211. mirror->hmm = NULL;
  212. goto again;
  213. } else {
  214. list_add(&mirror->list, &mirror->hmm->mirrors);
  215. up_write(&mirror->hmm->mirrors_sem);
  216. }
  217. return 0;
  218. }
  219. EXPORT_SYMBOL(hmm_mirror_register);
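/*
 * Illustrative sketch (not part of this file): how a device driver might
 * register a mirror. The foo_* names are hypothetical; only
 * hmm_mirror_register(), hmm_mirror_unregister() and the
 * sync_cpu_device_pagetables callback come from the HMM API used above,
 * and the exact callback prototype is declared in include/linux/hmm.h.
 *
 *   static void foo_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *                                              enum hmm_update_type update,
 *                                              unsigned long start,
 *                                              unsigned long end)
 *   {
 *       struct foo_mirror *foo;
 *
 *       foo = container_of(mirror, struct foo_mirror, mirror);
 *       // Invalidate the device page table for [start, end) and flush
 *       // any device TLBs before returning.
 *       foo_invalidate_device_range(foo, start, end);
 *   }
 *
 *   static const struct hmm_mirror_ops foo_mirror_ops = {
 *       .sync_cpu_device_pagetables = foo_sync_cpu_device_pagetables,
 *   };
 *
 *   // mmap_sem must be held in write mode (see the comment above).
 *   down_write(&mm->mmap_sem);
 *   foo->mirror.ops = &foo_mirror_ops;
 *   ret = hmm_mirror_register(&foo->mirror, mm);
 *   up_write(&mm->mmap_sem);
 */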
  220. /*
  221. * hmm_mirror_unregister() - unregister a mirror
  222. *
  223. * @mirror: mirror struct to unregister
  224. *
  225. * Stop mirroring a process address space and clean up.
  226. */
  227. void hmm_mirror_unregister(struct hmm_mirror *mirror)
  228. {
  229. bool should_unregister = false;
  230. struct mm_struct *mm;
  231. struct hmm *hmm;
  232. if (mirror->hmm == NULL)
  233. return;
  234. hmm = mirror->hmm;
  235. down_write(&hmm->mirrors_sem);
  236. list_del_init(&mirror->list);
  237. should_unregister = list_empty(&hmm->mirrors);
  238. mirror->hmm = NULL;
  239. mm = hmm->mm;
  240. hmm->mm = NULL;
  241. up_write(&hmm->mirrors_sem);
  242. if (!should_unregister || mm == NULL)
  243. return;
  244. spin_lock(&mm->page_table_lock);
  245. if (mm->hmm == hmm)
  246. mm->hmm = NULL;
  247. spin_unlock(&mm->page_table_lock);
  248. mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
  249. kfree(hmm);
  250. }
  251. EXPORT_SYMBOL(hmm_mirror_unregister);
  252. struct hmm_vma_walk {
  253. struct hmm_range *range;
  254. unsigned long last;
  255. bool fault;
  256. bool block;
  257. };
  258. static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
  259. bool write_fault, uint64_t *pfn)
  260. {
  261. unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
  262. struct hmm_vma_walk *hmm_vma_walk = walk->private;
  263. struct hmm_range *range = hmm_vma_walk->range;
  264. struct vm_area_struct *vma = walk->vma;
  265. int r;
  266. flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
  267. flags |= write_fault ? FAULT_FLAG_WRITE : 0;
  268. r = handle_mm_fault(vma, addr, flags);
  269. if (r & VM_FAULT_RETRY)
  270. return -EBUSY;
  271. if (r & VM_FAULT_ERROR) {
  272. *pfn = range->values[HMM_PFN_ERROR];
  273. return -EFAULT;
  274. }
  275. return -EAGAIN;
  276. }
  277. static int hmm_pfns_bad(unsigned long addr,
  278. unsigned long end,
  279. struct mm_walk *walk)
  280. {
  281. struct hmm_vma_walk *hmm_vma_walk = walk->private;
  282. struct hmm_range *range = hmm_vma_walk->range;
  283. uint64_t *pfns = range->pfns;
  284. unsigned long i;
  285. i = (addr - range->start) >> PAGE_SHIFT;
  286. for (; addr < end; addr += PAGE_SIZE, i++)
  287. pfns[i] = range->values[HMM_PFN_ERROR];
  288. return 0;
  289. }
  290. /*
  291. * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
  292. * @start: range virtual start address (inclusive)
  293. * @end: range virtual end address (exclusive)
  294. * @fault: should we fault or not ?
  295. * @write_fault: write fault ?
  296. * @walk: mm_walk structure
  297. * Returns: 0 on success, -EAGAIN after page fault, or page fault error
  298. *
  299. * This function will be called whenever pmd_none() or pte_none() returns true,
  300. * or whenever there is no page directory covering the virtual address range.
  301. */
  302. static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
  303. bool fault, bool write_fault,
  304. struct mm_walk *walk)
  305. {
  306. struct hmm_vma_walk *hmm_vma_walk = walk->private;
  307. struct hmm_range *range = hmm_vma_walk->range;
  308. uint64_t *pfns = range->pfns;
  309. unsigned long i;
  310. hmm_vma_walk->last = addr;
  311. i = (addr - range->start) >> PAGE_SHIFT;
  312. for (; addr < end; addr += PAGE_SIZE, i++) {
  313. pfns[i] = range->values[HMM_PFN_NONE];
  314. if (fault || write_fault) {
  315. int ret;
  316. ret = hmm_vma_do_fault(walk, addr, write_fault,
  317. &pfns[i]);
  318. if (ret != -EAGAIN)
  319. return ret;
  320. }
  321. }
  322. return (fault || write_fault) ? -EAGAIN : 0;
  323. }
  324. static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
  325. uint64_t pfns, uint64_t cpu_flags,
  326. bool *fault, bool *write_fault)
  327. {
  328. struct hmm_range *range = hmm_vma_walk->range;
  329. *fault = *write_fault = false;
  330. if (!hmm_vma_walk->fault)
  331. return;
  332. /* We aren't asked to do anything ... */
  333. if (!(pfns & range->flags[HMM_PFN_VALID]))
  334. return;
  335. /* If this is device memory then only fault if explicitly requested */
  336. if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
  337. /* Do we fault on device memory ? */
  338. if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
  339. *write_fault = pfns & range->flags[HMM_PFN_WRITE];
  340. *fault = true;
  341. }
  342. return;
  343. }
  344. /* If CPU page table is not valid then we need to fault */
  345. *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
  346. /* Need to write fault ? */
  347. if ((pfns & range->flags[HMM_PFN_WRITE]) &&
  348. !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
  349. *write_fault = true;
  350. *fault = true;
  351. }
  352. }
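/*
 * Illustrative sketch (not part of this file): range->flags[] and
 * range->values[] are supplied by the driver and map the generic HMM_PFN_*
 * indices used above onto the driver's own pfn encoding. A hypothetical
 * driver might provide something like the arrays below; the actual bit
 * layout must match what hmm_pfn_from_pfn() (see include/linux/hmm.h)
 * expects, so these values are purely an example.
 *
 *   static const uint64_t foo_range_flags[] = {
 *       [HMM_PFN_VALID]          = 1UL << 0,
 *       [HMM_PFN_WRITE]          = 1UL << 1,
 *       [HMM_PFN_DEVICE_PRIVATE] = 1UL << 2,
 *   };
 *
 *   static const uint64_t foo_range_values[] = {
 *       [HMM_PFN_NONE]    = 0,
 *       [HMM_PFN_ERROR]   = 1UL << 3,
 *       [HMM_PFN_SPECIAL] = 1UL << 4,
 *   };
 *
 * hmm_pte_need_fault() only ever compares the snapshot entry and the CPU
 * flags through these driver-defined bits.
 */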
  353. static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
  354. const uint64_t *pfns, unsigned long npages,
  355. uint64_t cpu_flags, bool *fault,
  356. bool *write_fault)
  357. {
  358. unsigned long i;
  359. if (!hmm_vma_walk->fault) {
  360. *fault = *write_fault = false;
  361. return;
  362. }
  363. for (i = 0; i < npages; ++i) {
  364. hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
  365. fault, write_fault);
  366. if ((*fault) || (*write_fault))
  367. return;
  368. }
  369. }
  370. static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
  371. struct mm_walk *walk)
  372. {
  373. struct hmm_vma_walk *hmm_vma_walk = walk->private;
  374. struct hmm_range *range = hmm_vma_walk->range;
  375. bool fault, write_fault;
  376. unsigned long i, npages;
  377. uint64_t *pfns;
  378. i = (addr - range->start) >> PAGE_SHIFT;
  379. npages = (end - addr) >> PAGE_SHIFT;
  380. pfns = &range->pfns[i];
  381. hmm_range_need_fault(hmm_vma_walk, pfns, npages,
  382. 0, &fault, &write_fault);
  383. return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
  384. }
  385. static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
  386. {
  387. if (pmd_protnone(pmd))
  388. return 0;
  389. return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
  390. range->flags[HMM_PFN_WRITE] :
  391. range->flags[HMM_PFN_VALID];
  392. }
  393. static int hmm_vma_handle_pmd(struct mm_walk *walk,
  394. unsigned long addr,
  395. unsigned long end,
  396. uint64_t *pfns,
  397. pmd_t pmd)
  398. {
  399. struct hmm_vma_walk *hmm_vma_walk = walk->private;
  400. struct hmm_range *range = hmm_vma_walk->range;
  401. unsigned long pfn, npages, i;
  402. bool fault, write_fault;
  403. uint64_t cpu_flags;
  404. npages = (end - addr) >> PAGE_SHIFT;
  405. cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
  406. hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
  407. &fault, &write_fault);
  408. if (pmd_protnone(pmd) || fault || write_fault)
  409. return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
  410. pfn = pmd_pfn(pmd) + pte_index(addr);
  411. for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
  412. pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
  413. hmm_vma_walk->last = end;
  414. return 0;
  415. }
  416. static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
  417. {
  418. if (pte_none(pte) || !pte_present(pte))
  419. return 0;
  420. return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
  421. range->flags[HMM_PFN_WRITE] :
  422. range->flags[HMM_PFN_VALID];
  423. }
  424. static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
  425. unsigned long end, pmd_t *pmdp, pte_t *ptep,
  426. uint64_t *pfn)
  427. {
  428. struct hmm_vma_walk *hmm_vma_walk = walk->private;
  429. struct hmm_range *range = hmm_vma_walk->range;
  430. struct vm_area_struct *vma = walk->vma;
  431. bool fault, write_fault;
  432. uint64_t cpu_flags;
  433. pte_t pte = *ptep;
  434. uint64_t orig_pfn = *pfn;
  435. *pfn = range->values[HMM_PFN_NONE];
  436. cpu_flags = pte_to_hmm_pfn_flags(range, pte);
  437. hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
  438. &fault, &write_fault);
  439. if (pte_none(pte)) {
  440. if (fault || write_fault)
  441. goto fault;
  442. return 0;
  443. }
  444. if (!pte_present(pte)) {
  445. swp_entry_t entry = pte_to_swp_entry(pte);
  446. if (!non_swap_entry(entry)) {
  447. if (fault || write_fault)
  448. goto fault;
  449. return 0;
  450. }
  451. /*
  452. * This is a special swap entry: report device private entries, wait
  453. * on migration entries when faulting, and report anything else as error.
  454. */
  455. if (is_device_private_entry(entry)) {
  456. cpu_flags = range->flags[HMM_PFN_VALID] |
  457. range->flags[HMM_PFN_DEVICE_PRIVATE];
  458. cpu_flags |= is_write_device_private_entry(entry) ?
  459. range->flags[HMM_PFN_WRITE] : 0;
  460. hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
  461. &fault, &write_fault);
  462. if (fault || write_fault)
  463. goto fault;
  464. *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
  465. *pfn |= cpu_flags;
  466. return 0;
  467. }
  468. if (is_migration_entry(entry)) {
  469. if (fault || write_fault) {
  470. pte_unmap(ptep);
  471. hmm_vma_walk->last = addr;
  472. migration_entry_wait(vma->vm_mm,
  473. pmdp, addr);
  474. return -EAGAIN;
  475. }
  476. return 0;
  477. }
  478. /* Report error for everything else */
  479. *pfn = range->values[HMM_PFN_ERROR];
  480. return -EFAULT;
  481. }
  482. if (fault || write_fault)
  483. goto fault;
  484. *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
  485. return 0;
  486. fault:
  487. pte_unmap(ptep);
  488. /* Fault any virtual address we were asked to fault */
  489. return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
  490. }
  491. static int hmm_vma_walk_pmd(pmd_t *pmdp,
  492. unsigned long start,
  493. unsigned long end,
  494. struct mm_walk *walk)
  495. {
  496. struct hmm_vma_walk *hmm_vma_walk = walk->private;
  497. struct hmm_range *range = hmm_vma_walk->range;
  498. uint64_t *pfns = range->pfns;
  499. unsigned long addr = start, i;
  500. pte_t *ptep;
  501. i = (addr - range->start) >> PAGE_SHIFT;
  502. again:
  503. if (pmd_none(*pmdp))
  504. return hmm_vma_walk_hole(start, end, walk);
  505. if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
  506. return hmm_pfns_bad(start, end, walk);
  507. if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
  508. pmd_t pmd;
  509. /*
  510. * No need to take the pmd_lock here: even if some other thread
  511. * is splitting the huge pmd, we will get that event through the
  512. * mmu_notifier callback.
  513. *
  514. * So just read the pmd value, check again that it is a transparent
  515. * huge or device mapping, and compute the corresponding pfn
  516. * values.
  517. */
  518. pmd = pmd_read_atomic(pmdp);
  519. barrier();
  520. if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
  521. goto again;
  522. return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
  523. }
  524. if (pmd_bad(*pmdp))
  525. return hmm_pfns_bad(start, end, walk);
  526. ptep = pte_offset_map(pmdp, addr);
  527. for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
  528. int r;
  529. r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
  530. if (r) {
  531. /* hmm_vma_handle_pte() already unmapped the pte */
  532. hmm_vma_walk->last = addr;
  533. return r;
  534. }
  535. }
  536. pte_unmap(ptep - 1);
  537. hmm_vma_walk->last = addr;
  538. return 0;
  539. }
  540. static void hmm_pfns_clear(struct hmm_range *range,
  541. uint64_t *pfns,
  542. unsigned long addr,
  543. unsigned long end)
  544. {
  545. for (; addr < end; addr += PAGE_SIZE, pfns++)
  546. *pfns = range->values[HMM_PFN_NONE];
  547. }
  548. static void hmm_pfns_special(struct hmm_range *range)
  549. {
  550. unsigned long addr = range->start, i = 0;
  551. for (; addr < range->end; addr += PAGE_SIZE, i++)
  552. range->pfns[i] = range->values[HMM_PFN_SPECIAL];
  553. }
  554. /*
  555. * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
  556. * @range: range being snapshotted
  557. * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
  558. * vma permission, 0 success
  559. *
  560. * This snapshots the CPU page table for a range of virtual addresses. Snapshot
  561. * validity is tracked by range struct. See hmm_vma_range_done() for further
  562. * information.
  563. *
  564. * The range struct is initialized here. It tracks the CPU page table, but only
  565. * if the function returns success (0), in which case the caller must then call
  566. * hmm_vma_range_done() to stop CPU page table update tracking on this range.
  567. *
  568. * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
  569. * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
  570. */
  571. int hmm_vma_get_pfns(struct hmm_range *range)
  572. {
  573. struct vm_area_struct *vma = range->vma;
  574. struct hmm_vma_walk hmm_vma_walk;
  575. struct mm_walk mm_walk;
  576. struct hmm *hmm;
  577. /* Sanity check, this really should not happen ! */
  578. if (range->start < vma->vm_start || range->start >= vma->vm_end)
  579. return -EINVAL;
  580. if (range->end < vma->vm_start || range->end > vma->vm_end)
  581. return -EINVAL;
  582. hmm = hmm_register(vma->vm_mm);
  583. if (!hmm)
  584. return -ENOMEM;
  585. /* Caller must have registered a mirror, via hmm_mirror_register() ! */
  586. if (!hmm->mmu_notifier.ops)
  587. return -EINVAL;
  588. /* FIXME support hugetlb fs */
  589. if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
  590. hmm_pfns_special(range);
  591. return -EINVAL;
  592. }
  593. if (!(vma->vm_flags & VM_READ)) {
  594. /*
  595. * If the vma does not allow read access, then assume that it does
  596. * not allow write access either. Architectures that allow write
  597. * without read access are not supported by HMM, because operations
  598. * such as atomic access would not work.
  599. */
  600. hmm_pfns_clear(range, range->pfns, range->start, range->end);
  601. return -EPERM;
  602. }
  603. /* Initialize range to track CPU page table update */
  604. spin_lock(&hmm->lock);
  605. range->valid = true;
  606. list_add_rcu(&range->list, &hmm->ranges);
  607. spin_unlock(&hmm->lock);
  608. hmm_vma_walk.fault = false;
  609. hmm_vma_walk.range = range;
  610. mm_walk.private = &hmm_vma_walk;
  611. mm_walk.vma = vma;
  612. mm_walk.mm = vma->vm_mm;
  613. mm_walk.pte_entry = NULL;
  614. mm_walk.test_walk = NULL;
  615. mm_walk.hugetlb_entry = NULL;
  616. mm_walk.pmd_entry = hmm_vma_walk_pmd;
  617. mm_walk.pte_hole = hmm_vma_walk_hole;
  618. walk_page_range(range->start, range->end, &mm_walk);
  619. return 0;
  620. }
  621. EXPORT_SYMBOL(hmm_vma_get_pfns);
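/*
 * Illustrative sketch (not part of this file): a minimal snapshot of one
 * vma, reusing the hypothetical foo_range_flags/foo_range_values arrays
 * from the earlier sketch. Only the hmm_range fields referenced in this
 * file are shown; any additional fields required by include/linux/hmm.h
 * (e.g. the pfn encoding parameters) are omitted.
 *
 *   struct hmm_range range = {
 *       .vma    = vma,
 *       .start  = start,            // page aligned, within vma
 *       .end    = end,              // page aligned, within vma
 *       .pfns   = pfns,             // (end - start) >> PAGE_SHIFT entries
 *       .flags  = foo_range_flags,
 *       .values = foo_range_values,
 *   };
 *
 *   down_read(&vma->vm_mm->mmap_sem);
 *   ret = hmm_vma_get_pfns(&range);
 *   if (!ret) {
 *       // ... build the device page table update from range.pfns ...
 *       if (!hmm_vma_range_done(&range))
 *           // snapshot was invalidated, retry
 *       // hmm_vma_range_done() MUST be called, see the warning above.
 *   }
 *   up_read(&vma->vm_mm->mmap_sem);
 */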
  622. /*
  623. * hmm_vma_range_done() - stop tracking change to CPU page table over a range
  624. * @range: range being tracked
  625. * Returns: false if range data has been invalidated, true otherwise
  626. *
  627. * Range struct is used to track updates to the CPU page table after a call to
  628. * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
  629. * using the data, or wants to lock updates to the data it got from those
  630. * functions, it must call the hmm_vma_range_done() function, which will then
  631. * stop tracking CPU page table updates.
  632. *
  633. * Note that device driver must still implement general CPU page table update
  634. * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
  635. * the mmu_notifier API directly.
  636. *
  637. * CPU page table update tracking done through hmm_range is only temporary and
  638. * to be used while trying to duplicate CPU page table contents for a range of
  639. * virtual addresses.
  640. *
  641. * There are two ways to use this :
  642. * again:
  643. * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
  644. * trans = device_build_page_table_update_transaction(pfns);
  645. * device_page_table_lock();
  646. * if (!hmm_vma_range_done(range)) {
  647. * device_page_table_unlock();
  648. * goto again;
  649. * }
  650. * device_commit_transaction(trans);
  651. * device_page_table_unlock();
  652. *
  653. * Or:
  654. * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
  655. * device_page_table_lock();
  656. * hmm_vma_range_done(range);
  657. * device_update_page_table(range->pfns);
  658. * device_page_table_unlock();
  659. */
  660. bool hmm_vma_range_done(struct hmm_range *range)
  661. {
  662. unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
  663. struct hmm *hmm;
  664. if (range->end <= range->start) {
  665. BUG();
  666. return false;
  667. }
  668. hmm = hmm_register(range->vma->vm_mm);
  669. if (!hmm) {
  670. memset(range->pfns, 0, sizeof(*range->pfns) * npages);
  671. return false;
  672. }
  673. spin_lock(&hmm->lock);
  674. list_del_rcu(&range->list);
  675. spin_unlock(&hmm->lock);
  676. return range->valid;
  677. }
  678. EXPORT_SYMBOL(hmm_vma_range_done);
  679. /*
  680. * hmm_vma_fault() - try to fault some address in a virtual address range
  681. * @range: range being faulted
  682. * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
  683. * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been dropped)
  684. *
  685. * This is similar to a regular CPU page fault except that it will not trigger
  686. * any memory migration if the memory being faulted is not accessible by CPUs.
  687. *
  688. * On error, for one virtual address in the range, the function will mark the
  689. * corresponding HMM pfn entry with an error flag.
  690. *
  691. * Expected use pattern:
  692. * retry:
  693. * down_read(&mm->mmap_sem);
  694. * // Find vma and address device wants to fault, initialize hmm_pfn_t
  695. * // array accordingly
  696. * ret = hmm_vma_fault(range, write, block);
  697. * switch (ret) {
  698. * case -EAGAIN:
  699. * hmm_vma_range_done(range);
  700. * // You might want to rate limit or yield to play nicely, you may
  701. * // also commit any valid pfn in the array assuming that you are
  702. * // getting true from hmm_vma_range_done()
  703. * goto retry;
  704. * case 0:
  705. * break;
  706. * case -ENOMEM:
  707. * case -EINVAL:
  708. * case -EPERM:
  709. * default:
  710. * // Handle error !
  711. * up_read(&mm->mmap_sem)
  712. * return;
  713. * }
  714. * // Take device driver lock that serialize device page table update
  715. * driver_lock_device_page_table_update();
  716. * hmm_vma_range_done(range);
  717. * // Commit pfns we got from hmm_vma_fault()
  718. * driver_unlock_device_page_table_update();
  719. * up_read(&mm->mmap_sem)
  720. *
  721. * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
  722. * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
  723. *
  724. * YOU HAVE BEEN WARNED !
  725. */
  726. int hmm_vma_fault(struct hmm_range *range, bool block)
  727. {
  728. struct vm_area_struct *vma = range->vma;
  729. unsigned long start = range->start;
  730. struct hmm_vma_walk hmm_vma_walk;
  731. struct mm_walk mm_walk;
  732. struct hmm *hmm;
  733. int ret;
  734. /* Sanity check, this really should not happen ! */
  735. if (range->start < vma->vm_start || range->start >= vma->vm_end)
  736. return -EINVAL;
  737. if (range->end < vma->vm_start || range->end > vma->vm_end)
  738. return -EINVAL;
  739. hmm = hmm_register(vma->vm_mm);
  740. if (!hmm) {
  741. hmm_pfns_clear(range, range->pfns, range->start, range->end);
  742. return -ENOMEM;
  743. }
  744. /* Caller must have registered a mirror using hmm_mirror_register() */
  745. if (!hmm->mmu_notifier.ops)
  746. return -EINVAL;
  747. /* FIXME support hugetlb fs */
  748. if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
  749. hmm_pfns_special(range);
  750. return -EINVAL;
  751. }
  752. if (!(vma->vm_flags & VM_READ)) {
  753. /*
  754. * If the vma does not allow read access, then assume that it does
  755. * not allow write access either. Architectures that allow write
  756. * without read access are not supported by HMM, because operations
  757. * such as atomic access would not work.
  758. */
  759. hmm_pfns_clear(range, range->pfns, range->start, range->end);
  760. return -EPERM;
  761. }
  762. /* Initialize range to track CPU page table update */
  763. spin_lock(&hmm->lock);
  764. range->valid = true;
  765. list_add_rcu(&range->list, &hmm->ranges);
  766. spin_unlock(&hmm->lock);
  767. hmm_vma_walk.fault = true;
  768. hmm_vma_walk.block = block;
  769. hmm_vma_walk.range = range;
  770. mm_walk.private = &hmm_vma_walk;
  771. hmm_vma_walk.last = range->start;
  772. mm_walk.vma = vma;
  773. mm_walk.mm = vma->vm_mm;
  774. mm_walk.pte_entry = NULL;
  775. mm_walk.test_walk = NULL;
  776. mm_walk.hugetlb_entry = NULL;
  777. mm_walk.pmd_entry = hmm_vma_walk_pmd;
  778. mm_walk.pte_hole = hmm_vma_walk_hole;
  779. do {
  780. ret = walk_page_range(start, range->end, &mm_walk);
  781. start = hmm_vma_walk.last;
  782. } while (ret == -EAGAIN);
  783. if (ret) {
  784. unsigned long i;
  785. i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
  786. hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
  787. range->end);
  788. hmm_vma_range_done(range);
  789. }
  790. return ret;
  791. }
  792. EXPORT_SYMBOL(hmm_vma_fault);
  793. #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
  794. #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
  795. struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
  796. unsigned long addr)
  797. {
  798. struct page *page;
  799. page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
  800. if (!page)
  801. return NULL;
  802. lock_page(page);
  803. return page;
  804. }
  805. EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
  806. static void hmm_devmem_ref_release(struct percpu_ref *ref)
  807. {
  808. struct hmm_devmem *devmem;
  809. devmem = container_of(ref, struct hmm_devmem, ref);
  810. complete(&devmem->completion);
  811. }
  812. static void hmm_devmem_ref_exit(void *data)
  813. {
  814. struct percpu_ref *ref = data;
  815. struct hmm_devmem *devmem;
  816. devmem = container_of(ref, struct hmm_devmem, ref);
  817. percpu_ref_exit(ref);
  818. devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
  819. }
  820. static void hmm_devmem_ref_kill(void *data)
  821. {
  822. struct percpu_ref *ref = data;
  823. struct hmm_devmem *devmem;
  824. devmem = container_of(ref, struct hmm_devmem, ref);
  825. percpu_ref_kill(ref);
  826. wait_for_completion(&devmem->completion);
  827. devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
  828. }
  829. static int hmm_devmem_fault(struct vm_area_struct *vma,
  830. unsigned long addr,
  831. const struct page *page,
  832. unsigned int flags,
  833. pmd_t *pmdp)
  834. {
  835. struct hmm_devmem *devmem = page->pgmap->data;
  836. return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
  837. }
  838. static void hmm_devmem_free(struct page *page, void *data)
  839. {
  840. struct hmm_devmem *devmem = data;
  841. devmem->ops->free(devmem, page);
  842. }
  843. static DEFINE_MUTEX(hmm_devmem_lock);
  844. static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
  845. static void hmm_devmem_radix_release(struct resource *resource)
  846. {
  847. resource_size_t key, align_start, align_size;
  848. align_start = resource->start & ~(PA_SECTION_SIZE - 1);
  849. align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
  850. mutex_lock(&hmm_devmem_lock);
  851. for (key = resource->start;
  852. key <= resource->end;
  853. key += PA_SECTION_SIZE)
  854. radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
  855. mutex_unlock(&hmm_devmem_lock);
  856. }
  857. static void hmm_devmem_release(struct device *dev, void *data)
  858. {
  859. struct hmm_devmem *devmem = data;
  860. struct resource *resource = devmem->resource;
  861. unsigned long start_pfn, npages;
  862. struct zone *zone;
  863. struct page *page;
  864. if (percpu_ref_tryget_live(&devmem->ref)) {
  865. dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
  866. percpu_ref_put(&devmem->ref);
  867. }
  868. /* pages are dead and unused, undo the arch mapping */
  869. start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
  870. npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
  871. page = pfn_to_page(start_pfn);
  872. zone = page_zone(page);
  873. mem_hotplug_begin();
  874. if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
  875. __remove_pages(zone, start_pfn, npages, NULL);
  876. else
  877. arch_remove_memory(start_pfn << PAGE_SHIFT,
  878. npages << PAGE_SHIFT, NULL);
  879. mem_hotplug_done();
  880. hmm_devmem_radix_release(resource);
  881. }
  882. static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
  883. {
  884. resource_size_t key, align_start, align_size, align_end;
  885. struct device *device = devmem->device;
  886. int ret, nid, is_ram;
  887. unsigned long pfn;
  888. align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
  889. align_size = ALIGN(devmem->resource->start +
  890. resource_size(devmem->resource),
  891. PA_SECTION_SIZE) - align_start;
  892. is_ram = region_intersects(align_start, align_size,
  893. IORESOURCE_SYSTEM_RAM,
  894. IORES_DESC_NONE);
  895. if (is_ram == REGION_MIXED) {
  896. WARN_ONCE(1, "%s attempted on mixed region %pr\n",
  897. __func__, devmem->resource);
  898. return -ENXIO;
  899. }
  900. if (is_ram == REGION_INTERSECTS)
  901. return -ENXIO;
  902. if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
  903. devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
  904. else
  905. devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
  906. devmem->pagemap.res = *devmem->resource;
  907. devmem->pagemap.page_fault = hmm_devmem_fault;
  908. devmem->pagemap.page_free = hmm_devmem_free;
  909. devmem->pagemap.dev = devmem->device;
  910. devmem->pagemap.ref = &devmem->ref;
  911. devmem->pagemap.data = devmem;
  912. mutex_lock(&hmm_devmem_lock);
  913. align_end = align_start + align_size - 1;
  914. for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
  915. struct hmm_devmem *dup;
  916. dup = radix_tree_lookup(&hmm_devmem_radix,
  917. key >> PA_SECTION_SHIFT);
  918. if (dup) {
  919. dev_err(device, "%s: collides with mapping for %s\n",
  920. __func__, dev_name(dup->device));
  921. mutex_unlock(&hmm_devmem_lock);
  922. ret = -EBUSY;
  923. goto error;
  924. }
  925. ret = radix_tree_insert(&hmm_devmem_radix,
  926. key >> PA_SECTION_SHIFT,
  927. devmem);
  928. if (ret) {
  929. dev_err(device, "%s: failed: %d\n", __func__, ret);
  930. mutex_unlock(&hmm_devmem_lock);
  931. goto error_radix;
  932. }
  933. }
  934. mutex_unlock(&hmm_devmem_lock);
  935. nid = dev_to_node(device);
  936. if (nid < 0)
  937. nid = numa_mem_id();
  938. mem_hotplug_begin();
  939. /*
  940. * For device private memory we call add_pages() as we only need to
  941. * allocate and initialize struct page for the device memory. Moreover,
  942. * the device memory is not accessible by the CPU, thus we do not want
  943. * to create a linear mapping for it the way arch_add_memory()
  944. * would.
  945. *
  946. * For device public memory, which is accessible by the CPU, we do
  947. * want the linear mapping and thus use arch_add_memory().
  948. */
  949. if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
  950. ret = arch_add_memory(nid, align_start, align_size, NULL,
  951. false);
  952. else
  953. ret = add_pages(nid, align_start >> PAGE_SHIFT,
  954. align_size >> PAGE_SHIFT, NULL, false);
  955. if (ret) {
  956. mem_hotplug_done();
  957. goto error_add_memory;
  958. }
  959. move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
  960. align_start >> PAGE_SHIFT,
  961. align_size >> PAGE_SHIFT, NULL);
  962. mem_hotplug_done();
  963. for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
  964. struct page *page = pfn_to_page(pfn);
  965. page->pgmap = &devmem->pagemap;
  966. }
  967. return 0;
  968. error_add_memory:
  969. untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
  970. error_radix:
  971. hmm_devmem_radix_release(devmem->resource);
  972. error:
  973. return ret;
  974. }
  975. static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
  976. {
  977. struct hmm_devmem *devmem = data;
  978. return devmem->resource == match_data;
  979. }
  980. static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
  981. {
  982. devres_release(devmem->device, &hmm_devmem_release,
  983. &hmm_devmem_match, devmem->resource);
  984. }
  985. /*
  986. * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
  987. *
  988. * @ops: memory event device driver callback (see struct hmm_devmem_ops)
  989. * @device: device struct to bind the resource to
  990. * @size: size in bytes of the device memory to add
  991. * Returns: pointer to new hmm_devmem struct, or ERR_PTR on error
  992. *
  993. * This function first finds an empty range of physical addresses big enough to
  994. * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
  995. * in turn allocates struct pages. It does not do anything beyond that; all
  996. * events affecting the memory will go through the various callbacks provided
  997. * by hmm_devmem_ops struct.
  998. *
  999. * The device driver should call this function during device initialization
  1000. * and is then responsible for memory management. HMM only provides helpers.
  1001. */
  1002. struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
  1003. struct device *device,
  1004. unsigned long size)
  1005. {
  1006. struct hmm_devmem *devmem;
  1007. resource_size_t addr;
  1008. int ret;
  1009. dev_pagemap_get_ops();
  1010. devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
  1011. GFP_KERNEL, dev_to_node(device));
  1012. if (!devmem)
  1013. return ERR_PTR(-ENOMEM);
  1014. init_completion(&devmem->completion);
  1015. devmem->pfn_first = -1UL;
  1016. devmem->pfn_last = -1UL;
  1017. devmem->resource = NULL;
  1018. devmem->device = device;
  1019. devmem->ops = ops;
  1020. ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
  1021. 0, GFP_KERNEL);
  1022. if (ret)
  1023. goto error_percpu_ref;
  1024. ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
  1025. if (ret)
  1026. goto error_devm_add_action;
  1027. size = ALIGN(size, PA_SECTION_SIZE);
  1028. addr = min((unsigned long)iomem_resource.end,
  1029. (1UL << MAX_PHYSMEM_BITS) - 1);
  1030. addr = addr - size + 1UL;
  1031. /*
  1032. * FIXME add a new helper to quickly walk resource tree and find free
  1033. * range
  1034. *
  1035. * FIXME what about ioport_resource resource ?
  1036. */
  1037. for (; addr > size && addr >= iomem_resource.start; addr -= size) {
  1038. ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
  1039. if (ret != REGION_DISJOINT)
  1040. continue;
  1041. devmem->resource = devm_request_mem_region(device, addr, size,
  1042. dev_name(device));
  1043. if (!devmem->resource) {
  1044. ret = -ENOMEM;
  1045. goto error_no_resource;
  1046. }
  1047. break;
  1048. }
  1049. if (!devmem->resource) {
  1050. ret = -ERANGE;
  1051. goto error_no_resource;
  1052. }
  1053. devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
  1054. devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
  1055. devmem->pfn_last = devmem->pfn_first +
  1056. (resource_size(devmem->resource) >> PAGE_SHIFT);
  1057. ret = hmm_devmem_pages_create(devmem);
  1058. if (ret)
  1059. goto error_pages;
  1060. devres_add(device, devmem);
  1061. ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
  1062. if (ret) {
  1063. hmm_devmem_remove(devmem);
  1064. return ERR_PTR(ret);
  1065. }
  1066. return devmem;
  1067. error_pages:
  1068. devm_release_mem_region(device, devmem->resource->start,
  1069. resource_size(devmem->resource));
  1070. error_no_resource:
  1071. error_devm_add_action:
  1072. hmm_devmem_ref_kill(&devmem->ref);
  1073. hmm_devmem_ref_exit(&devmem->ref);
  1074. error_percpu_ref:
  1075. devres_free(devmem);
  1076. return ERR_PTR(ret);
  1077. }
  1078. EXPORT_SYMBOL(hmm_devmem_add);
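/*
 * Illustrative sketch (not part of this file): hotplugging device memory.
 * The foo_* callbacks are hypothetical; their prototypes follow the way
 * hmm_devmem_fault() and hmm_devmem_free() above invoke devmem->ops, and
 * the authoritative struct hmm_devmem_ops is in include/linux/hmm.h.
 *
 *   static int foo_devmem_fault(struct hmm_devmem *devmem,
 *                               struct vm_area_struct *vma,
 *                               unsigned long addr,
 *                               const struct page *page,
 *                               unsigned int flags,
 *                               pmd_t *pmdp)
 *   {
 *       // Migrate the device page back to system memory so the CPU
 *       // fault can make progress.
 *       return foo_migrate_to_ram(devmem, vma, addr, page, flags, pmdp);
 *   }
 *
 *   static void foo_devmem_free(struct hmm_devmem *devmem, struct page *page)
 *   {
 *       // Return the backing device memory to the driver's allocator.
 *       foo_free_device_page(devmem, page);
 *   }
 *
 *   static const struct hmm_devmem_ops foo_devmem_ops = {
 *       .fault = foo_devmem_fault,
 *       .free  = foo_devmem_free,
 *   };
 *
 *   devmem = hmm_devmem_add(&foo_devmem_ops, device, FOO_DEVICE_MEMORY_SIZE);
 *   if (IS_ERR(devmem))
 *       return PTR_ERR(devmem);
 *   // Device pages now cover [devmem->pfn_first, devmem->pfn_last);
 *   // hmm_devmem_remove() undoes this at teardown.
 */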
  1079. struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
  1080. struct device *device,
  1081. struct resource *res)
  1082. {
  1083. struct hmm_devmem *devmem;
  1084. int ret;
  1085. if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
  1086. return ERR_PTR(-EINVAL);
  1087. dev_pagemap_get_ops();
  1088. devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
  1089. GFP_KERNEL, dev_to_node(device));
  1090. if (!devmem)
  1091. return ERR_PTR(-ENOMEM);
  1092. init_completion(&devmem->completion);
  1093. devmem->pfn_first = -1UL;
  1094. devmem->pfn_last = -1UL;
  1095. devmem->resource = res;
  1096. devmem->device = device;
  1097. devmem->ops = ops;
  1098. ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
  1099. 0, GFP_KERNEL);
  1100. if (ret)
  1101. goto error_percpu_ref;
  1102. ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
  1103. if (ret)
  1104. goto error_devm_add_action;
  1105. devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
  1106. devmem->pfn_last = devmem->pfn_first +
  1107. (resource_size(devmem->resource) >> PAGE_SHIFT);
  1108. ret = hmm_devmem_pages_create(devmem);
  1109. if (ret)
  1110. goto error_devm_add_action;
  1111. devres_add(device, devmem);
  1112. ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
  1113. if (ret) {
  1114. hmm_devmem_remove(devmem);
  1115. return ERR_PTR(ret);
  1116. }
  1117. return devmem;
  1118. error_devm_add_action:
  1119. hmm_devmem_ref_kill(&devmem->ref);
  1120. hmm_devmem_ref_exit(&devmem->ref);
  1121. error_percpu_ref:
  1122. devres_free(devmem);
  1123. return ERR_PTR(ret);
  1124. }
  1125. EXPORT_SYMBOL(hmm_devmem_add_resource);
  1126. /*
  1127. * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
  1128. *
  1129. * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
  1130. *
  1131. * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
  1132. * of the device driver. It will free struct page and remove the resource that
  1133. * reserved the physical address range for this device memory.
  1134. */
  1135. void hmm_devmem_remove(struct hmm_devmem *devmem)
  1136. {
  1137. resource_size_t start, size;
  1138. struct device *device;
  1139. bool cdm = false;
  1140. if (!devmem)
  1141. return;
  1142. device = devmem->device;
  1143. start = devmem->resource->start;
  1144. size = resource_size(devmem->resource);
  1145. cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
  1146. hmm_devmem_ref_kill(&devmem->ref);
  1147. hmm_devmem_ref_exit(&devmem->ref);
  1148. hmm_devmem_pages_remove(devmem);
  1149. if (!cdm)
  1150. devm_release_mem_region(device, start, size);
  1151. }
  1152. EXPORT_SYMBOL(hmm_devmem_remove);
  1153. /*
  1154. * A device driver that wants to handle the memory of multiple devices through
  1155. * a single fake device can use hmm_device to do so. This is purely a helper
  1156. * and it is not required in order to use any other HMM functionality.
  1157. */
  1158. #define HMM_DEVICE_MAX 256
  1159. static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
  1160. static DEFINE_SPINLOCK(hmm_device_lock);
  1161. static struct class *hmm_device_class;
  1162. static dev_t hmm_device_devt;
  1163. static void hmm_device_release(struct device *device)
  1164. {
  1165. struct hmm_device *hmm_device;
  1166. hmm_device = container_of(device, struct hmm_device, device);
  1167. spin_lock(&hmm_device_lock);
  1168. clear_bit(hmm_device->minor, hmm_device_mask);
  1169. spin_unlock(&hmm_device_lock);
  1170. kfree(hmm_device);
  1171. }
  1172. struct hmm_device *hmm_device_new(void *drvdata)
  1173. {
  1174. struct hmm_device *hmm_device;
  1175. hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
  1176. if (!hmm_device)
  1177. return ERR_PTR(-ENOMEM);
  1178. spin_lock(&hmm_device_lock);
  1179. hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
  1180. if (hmm_device->minor >= HMM_DEVICE_MAX) {
  1181. spin_unlock(&hmm_device_lock);
  1182. kfree(hmm_device);
  1183. return ERR_PTR(-EBUSY);
  1184. }
  1185. set_bit(hmm_device->minor, hmm_device_mask);
  1186. spin_unlock(&hmm_device_lock);
  1187. dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
  1188. hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
  1189. hmm_device->minor);
  1190. hmm_device->device.release = hmm_device_release;
  1191. dev_set_drvdata(&hmm_device->device, drvdata);
  1192. hmm_device->device.class = hmm_device_class;
  1193. device_initialize(&hmm_device->device);
  1194. return hmm_device;
  1195. }
  1196. EXPORT_SYMBOL(hmm_device_new);
  1197. void hmm_device_put(struct hmm_device *hmm_device)
  1198. {
  1199. put_device(&hmm_device->device);
  1200. }
  1201. EXPORT_SYMBOL(hmm_device_put);
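/*
 * Illustrative sketch (not part of this file): using the fake hmm_device
 * helper described above. Error handling is trimmed and foo_driver_data
 * is hypothetical.
 *
 *   struct hmm_device *hmm_device;
 *
 *   hmm_device = hmm_device_new(foo_driver_data);
 *   if (IS_ERR(hmm_device))
 *       return PTR_ERR(hmm_device);
 *   // ... use &hmm_device->device as the backing struct device, e.g. for
 *   // hmm_devmem_add() ...
 *   hmm_device_put(hmm_device);
 */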
  1202. static int __init hmm_init(void)
  1203. {
  1204. int ret;
  1205. ret = alloc_chrdev_region(&hmm_device_devt, 0,
  1206. HMM_DEVICE_MAX,
  1207. "hmm_device");
  1208. if (ret)
  1209. return ret;
  1210. hmm_device_class = class_create(THIS_MODULE, "hmm_device");
  1211. if (IS_ERR(hmm_device_class)) {
  1212. unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
  1213. return PTR_ERR(hmm_device_class);
  1214. }
  1215. return 0;
  1216. }
  1217. device_initcall(hmm_init);
  1218. #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */