hmm.c

/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
        struct mm_struct        *mm;
        spinlock_t              lock;
        struct list_head        ranges;
        struct list_head        mirrors;
        struct mmu_notifier     mmu_notifier;
        struct rw_semaphore     mirrors_sem;
};
/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
        struct hmm *hmm = READ_ONCE(mm->hmm);
        bool cleanup = false;

        /*
         * The hmm struct can only be freed once the mm_struct goes away,
         * hence we should always have pre-allocated a new hmm struct
         * above.
         */
        if (hmm)
                return hmm;

        hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
        if (!hmm)
                return NULL;
        INIT_LIST_HEAD(&hmm->mirrors);
        init_rwsem(&hmm->mirrors_sem);
        hmm->mmu_notifier.ops = NULL;
        INIT_LIST_HEAD(&hmm->ranges);
        spin_lock_init(&hmm->lock);
        hmm->mm = mm;

        spin_lock(&mm->page_table_lock);
        if (!mm->hmm)
                mm->hmm = hmm;
        else
                cleanup = true;
        spin_unlock(&mm->page_table_lock);

        if (cleanup)
                goto error;

        /*
         * We should only get here if we hold the mmap_sem in write mode,
         * i.e. on registration of the first mirror through
         * hmm_mirror_register().
         */
        hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
        if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
                goto error_mm;

        return mm->hmm;

error_mm:
        spin_lock(&mm->page_table_lock);
        if (mm->hmm == hmm)
                mm->hmm = NULL;
        spin_unlock(&mm->page_table_lock);
error:
        kfree(hmm);
        return NULL;
}
void hmm_mm_destroy(struct mm_struct *mm)
{
        kfree(mm->hmm);
}

static int hmm_invalidate_range(struct hmm *hmm, bool device,
                                const struct hmm_update *update)
{
        struct hmm_mirror *mirror;
        struct hmm_range *range;

        spin_lock(&hmm->lock);
        list_for_each_entry(range, &hmm->ranges, list) {
                unsigned long addr, idx, npages;

                if (update->end < range->start || update->start >= range->end)
                        continue;

                range->valid = false;
                addr = max(update->start, range->start);
                idx = (addr - range->start) >> PAGE_SHIFT;
                npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
                memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
        }
        spin_unlock(&hmm->lock);

        if (!device)
                return 0;

        down_read(&hmm->mirrors_sem);
        list_for_each_entry(mirror, &hmm->mirrors, list) {
                int ret;

                ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
                if (!update->blockable && ret == -EAGAIN) {
                        up_read(&hmm->mirrors_sem);
                        return -EAGAIN;
                }
        }
        up_read(&hmm->mirrors_sem);

        return 0;
}
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
        struct hmm_mirror *mirror;
        struct hmm *hmm = mm->hmm;

        down_write(&hmm->mirrors_sem);
        mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
                                          list);
        while (mirror) {
                list_del_init(&mirror->list);
                if (mirror->ops->release) {
                        /*
                         * Drop mirrors_sem so callback can wait on any pending
                         * work that might itself trigger mmu_notifier callback
                         * and thus would deadlock with us.
                         */
                        up_write(&hmm->mirrors_sem);
                        mirror->ops->release(mirror);
                        down_write(&hmm->mirrors_sem);
                }
                mirror = list_first_entry_or_null(&hmm->mirrors,
                                                  struct hmm_mirror, list);
        }
        up_write(&hmm->mirrors_sem);
}

static int hmm_invalidate_range_start(struct mmu_notifier *mn,
                                      struct mm_struct *mm,
                                      unsigned long start,
                                      unsigned long end,
                                      bool blockable)
{
        struct hmm_update update;
        struct hmm *hmm = mm->hmm;

        VM_BUG_ON(!hmm);

        update.start = start;
        update.end = end;
        update.event = HMM_UPDATE_INVALIDATE;
        update.blockable = blockable;
        return hmm_invalidate_range(hmm, true, &update);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
                                     struct mm_struct *mm,
                                     unsigned long start,
                                     unsigned long end)
{
        struct hmm_update update;
        struct hmm *hmm = mm->hmm;

        VM_BUG_ON(!hmm);

        update.start = start;
        update.end = end;
        update.event = HMM_UPDATE_INVALIDATE;
        update.blockable = true;
        hmm_invalidate_range(hmm, false, &update);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
        .release                = hmm_release,
        .invalidate_range_start = hmm_invalidate_range_start,
        .invalidate_range_end   = hmm_invalidate_range_end,
};
/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct. An illustrative registration sketch follows the
 * function below.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
        /* Sanity check */
        if (!mm || !mirror || !mirror->ops)
                return -EINVAL;

again:
        mirror->hmm = hmm_register(mm);
        if (!mirror->hmm)
                return -ENOMEM;

        down_write(&mirror->hmm->mirrors_sem);
        if (mirror->hmm->mm == NULL) {
                /*
                 * A racing hmm_mirror_unregister() is about to destroy the hmm
                 * struct. Try again to allocate a new one.
                 */
                up_write(&mirror->hmm->mirrors_sem);
                mirror->hmm = NULL;
                goto again;
        } else {
                list_add(&mirror->list, &mirror->hmm->mirrors);
                up_write(&mirror->hmm->mirrors_sem);
        }

        return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);
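
/*
 * Illustrative usage sketch (not part of the original file): a driver that
 * wants to mirror a process address space embeds a struct hmm_mirror, fills
 * in hmm_mirror_ops and registers it while holding mmap_sem for writing.
 * The names my_mirror, my_sync_cpu_device_pagetables and my_release below
 * are hypothetical driver code, shown only to clarify the calling
 * convention.
 *
 *   static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *                                            const struct hmm_update *update)
 *   {
 *           // Invalidate the device page table for
 *           // [update->start, update->end) here.
 *           return 0;
 *   }
 *
 *   static void my_release(struct hmm_mirror *mirror)
 *   {
 *           // The mm is going away; tear down device mappings.
 *   }
 *
 *   static const struct hmm_mirror_ops my_mirror_ops = {
 *           .sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
 *           .release = my_release,
 *   };
 *
 *   static struct hmm_mirror my_mirror = { .ops = &my_mirror_ops };
 *
 *   down_write(&mm->mmap_sem);
 *   ret = hmm_mirror_register(&my_mirror, mm);
 *   up_write(&mm->mmap_sem);
 */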
/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
        bool should_unregister = false;
        struct mm_struct *mm;
        struct hmm *hmm;

        if (mirror->hmm == NULL)
                return;

        hmm = mirror->hmm;
        down_write(&hmm->mirrors_sem);
        list_del_init(&mirror->list);
        should_unregister = list_empty(&hmm->mirrors);
        mirror->hmm = NULL;
        mm = hmm->mm;
        hmm->mm = NULL;
        up_write(&hmm->mirrors_sem);

        if (!should_unregister || mm == NULL)
                return;

        mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);

        spin_lock(&mm->page_table_lock);
        if (mm->hmm == hmm)
                mm->hmm = NULL;
        spin_unlock(&mm->page_table_lock);

        kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
struct hmm_vma_walk {
        struct hmm_range        *range;
        unsigned long           last;
        bool                    fault;
        bool                    block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
                            bool write_fault, uint64_t *pfn)
{
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        vm_fault_t ret;

        flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
        flags |= write_fault ? FAULT_FLAG_WRITE : 0;
        ret = handle_mm_fault(vma, addr, flags);
        if (ret & VM_FAULT_RETRY)
                return -EBUSY;
        if (ret & VM_FAULT_ERROR) {
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        }

        return -EAGAIN;
}

static int hmm_pfns_bad(unsigned long addr,
                        unsigned long end,
                        struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long i;

        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++)
                pfns[i] = range->values[HMM_PFN_ERROR];

        return 0;
}
/*
 * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Returns: 0 on success, -EAGAIN after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
                              bool fault, bool write_fault,
                              struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long i;

        hmm_vma_walk->last = addr;
        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = range->values[HMM_PFN_NONE];
                if (fault || write_fault) {
                        int ret;

                        ret = hmm_vma_do_fault(walk, addr, write_fault,
                                               &pfns[i]);
                        if (ret != -EAGAIN)
                                return ret;
                }
        }

        return (fault || write_fault) ? -EAGAIN : 0;
}
static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                      uint64_t pfns, uint64_t cpu_flags,
                                      bool *fault, bool *write_fault)
{
        struct hmm_range *range = hmm_vma_walk->range;

        *fault = *write_fault = false;
        if (!hmm_vma_walk->fault)
                return;

        /* We aren't asked to do anything ... */
        if (!(pfns & range->flags[HMM_PFN_VALID]))
                return;
        /* If this is device memory then only fault if explicitly requested */
        if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
                /* Do we fault on device memory ? */
                if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
                        *write_fault = pfns & range->flags[HMM_PFN_WRITE];
                        *fault = true;
                }
                return;
        }

        /* If CPU page table is not valid then we need to fault */
        *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
        /* Need to write fault ? */
        if ((pfns & range->flags[HMM_PFN_WRITE]) &&
            !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
                *write_fault = true;
                *fault = true;
        }
}
static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                 const uint64_t *pfns, unsigned long npages,
                                 uint64_t cpu_flags, bool *fault,
                                 bool *write_fault)
{
        unsigned long i;

        if (!hmm_vma_walk->fault) {
                *fault = *write_fault = false;
                return;
        }

        for (i = 0; i < npages; ++i) {
                hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
                                   fault, write_fault);
                if ((*fault) || (*write_fault))
                        return;
        }
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
                             struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        bool fault, write_fault;
        unsigned long i, npages;
        uint64_t *pfns;

        i = (addr - range->start) >> PAGE_SHIFT;
        npages = (end - addr) >> PAGE_SHIFT;
        pfns = &range->pfns[i];
        hmm_range_need_fault(hmm_vma_walk, pfns, npages,
                             0, &fault, &write_fault);
        return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}
static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
        if (pmd_protnone(pmd))
                return 0;
        return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
                              unsigned long addr,
                              unsigned long end,
                              uint64_t *pfns,
                              pmd_t pmd)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long pfn, npages, i;
        bool fault, write_fault;
        uint64_t cpu_flags;

        npages = (end - addr) >> PAGE_SHIFT;
        cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
        hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
                             &fault, &write_fault);

        if (pmd_protnone(pmd) || fault || write_fault)
                return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

        pfn = pmd_pfn(pmd) + pte_index(addr);
        for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
                pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
        hmm_vma_walk->last = end;
        return 0;
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
        if (pte_none(pte) || !pte_present(pte))
                return 0;
        return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                              unsigned long end, pmd_t *pmdp, pte_t *ptep,
                              uint64_t *pfn)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        bool fault, write_fault;
        uint64_t cpu_flags;
        pte_t pte = *ptep;
        uint64_t orig_pfn = *pfn;

        *pfn = range->values[HMM_PFN_NONE];
        cpu_flags = pte_to_hmm_pfn_flags(range, pte);
        hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                           &fault, &write_fault);

        if (pte_none(pte)) {
                if (fault || write_fault)
                        goto fault;
                return 0;
        }

        if (!pte_present(pte)) {
                swp_entry_t entry = pte_to_swp_entry(pte);

                if (!non_swap_entry(entry)) {
                        if (fault || write_fault)
                                goto fault;
                        return 0;
                }

                /*
                 * This is a special swap entry: handle device private and
                 * migration entries below, and report anything else as an
                 * error.
                 */
                if (is_device_private_entry(entry)) {
                        cpu_flags = range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_DEVICE_PRIVATE];
                        cpu_flags |= is_write_device_private_entry(entry) ?
                                range->flags[HMM_PFN_WRITE] : 0;
                        hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                                           &fault, &write_fault);
                        if (fault || write_fault)
                                goto fault;
                        *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
                        *pfn |= cpu_flags;
                        return 0;
                }

                if (is_migration_entry(entry)) {
                        if (fault || write_fault) {
                                pte_unmap(ptep);
                                hmm_vma_walk->last = addr;
                                migration_entry_wait(vma->vm_mm,
                                                     pmdp, addr);
                                return -EAGAIN;
                        }
                        return 0;
                }

                /* Report error for everything else */
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        }

        if (fault || write_fault)
                goto fault;

        *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
        return 0;

fault:
        pte_unmap(ptep);
        /* Fault any virtual address we were asked to fault */
        return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}
static int hmm_vma_walk_pmd(pmd_t *pmdp,
                            unsigned long start,
                            unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        uint64_t *pfns = range->pfns;
        unsigned long addr = start, i;
        pte_t *ptep;
        pmd_t pmd;

again:
        pmd = READ_ONCE(*pmdp);
        if (pmd_none(pmd))
                return hmm_vma_walk_hole(start, end, walk);

        if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
                return hmm_pfns_bad(start, end, walk);

        if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
                bool fault, write_fault;
                unsigned long npages;
                uint64_t *pfns;

                i = (addr - range->start) >> PAGE_SHIFT;
                npages = (end - addr) >> PAGE_SHIFT;
                pfns = &range->pfns[i];

                hmm_range_need_fault(hmm_vma_walk, pfns, npages,
                                     0, &fault, &write_fault);
                if (fault || write_fault) {
                        hmm_vma_walk->last = addr;
                        pmd_migration_entry_wait(vma->vm_mm, pmdp);
                        return -EAGAIN;
                }
                return 0;
        } else if (!pmd_present(pmd))
                return hmm_pfns_bad(start, end, walk);

        if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
                /*
                 * No need to take pmd_lock here; even if some other thread
                 * is splitting the huge pmd we will get that event through
                 * the mmu_notifier callback.
                 *
                 * So just read the pmd value and check again that it is a
                 * transparent huge or device mapping one, and compute the
                 * corresponding pfn values.
                 */
                pmd = pmd_read_atomic(pmdp);
                barrier();
                if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
                        goto again;

                i = (addr - range->start) >> PAGE_SHIFT;
                return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
        }

        /*
         * We have handled all the valid cases above, i.e. either none,
         * migration, huge or transparent huge. At this point either it is a
         * valid pmd entry pointing to a pte directory or it is a bad pmd that
         * will not recover.
         */
        if (pmd_bad(pmd))
                return hmm_pfns_bad(start, end, walk);

        ptep = pte_offset_map(pmdp, addr);
        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
                int r;

                r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
                if (r) {
                        /* hmm_vma_handle_pte() did unmap pte directory */
                        hmm_vma_walk->last = addr;
                        return r;
                }
        }
        pte_unmap(ptep - 1);

        hmm_vma_walk->last = addr;
        return 0;
}
static void hmm_pfns_clear(struct hmm_range *range,
                           uint64_t *pfns,
                           unsigned long addr,
                           unsigned long end)
{
        for (; addr < end; addr += PAGE_SIZE, pfns++)
                *pfns = range->values[HMM_PFN_NONE];
}

static void hmm_pfns_special(struct hmm_range *range)
{
        unsigned long addr = range->start, i = 0;

        for (; addr < range->end; addr += PAGE_SIZE, i++)
                range->pfns[i] = range->values[HMM_PFN_SPECIAL];
}
/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @range: range being snapshotted
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 * vma permission, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by range struct. See hmm_vma_range_done() for further
 * information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
 */
int hmm_vma_get_pfns(struct hmm_range *range)
{
        struct vm_area_struct *vma = range->vma;
        struct hmm_vma_walk hmm_vma_walk;
        struct mm_walk mm_walk;
        struct hmm *hmm;

        /* Sanity check, this really should not happen ! */
        if (range->start < vma->vm_start || range->start >= vma->vm_end)
                return -EINVAL;
        if (range->end < vma->vm_start || range->end > vma->vm_end)
                return -EINVAL;

        hmm = hmm_register(vma->vm_mm);
        if (!hmm)
                return -ENOMEM;
        /* Caller must have registered a mirror, via hmm_mirror_register() ! */
        if (!hmm->mmu_notifier.ops)
                return -EINVAL;

        /* FIXME support hugetlb fs */
        if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
                        vma_is_dax(vma)) {
                hmm_pfns_special(range);
                return -EINVAL;
        }

        if (!(vma->vm_flags & VM_READ)) {
                /*
                 * If the vma does not allow read access, then assume that it
                 * does not allow write access, either. Architectures that
                 * allow write without read access are not supported by HMM,
                 * because operations such as atomic access would not work.
                 */
                hmm_pfns_clear(range, range->pfns, range->start, range->end);
                return -EPERM;
        }

        /* Initialize range to track CPU page table update */
        spin_lock(&hmm->lock);
        range->valid = true;
        list_add_rcu(&range->list, &hmm->ranges);
        spin_unlock(&hmm->lock);

        hmm_vma_walk.fault = false;
        hmm_vma_walk.range = range;
        mm_walk.private = &hmm_vma_walk;

        mm_walk.vma = vma;
        mm_walk.mm = vma->vm_mm;
        mm_walk.pte_entry = NULL;
        mm_walk.test_walk = NULL;
        mm_walk.hugetlb_entry = NULL;
        mm_walk.pmd_entry = hmm_vma_walk_pmd;
        mm_walk.pte_hole = hmm_vma_walk_hole;

        walk_page_range(range->start, range->end, &mm_walk);
        return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);
/*
 * hmm_vma_range_done() - stop tracking change to CPU page table over a range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
 * using the data, or wants to lock updates to the data it got from those
 * functions, it must call the hmm_vma_range_done() function, which will then
 * stop tracking CPU page table updates.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or
 * by using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this:
 * again:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(range);
 *   device_update_page_table(range->pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct hmm_range *range)
{
        unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
        struct hmm *hmm;

        if (range->end <= range->start) {
                BUG();
                return false;
        }

        hmm = hmm_register(range->vma->vm_mm);
        if (!hmm) {
                memset(range->pfns, 0, sizeof(*range->pfns) * npages);
                return false;
        }

        spin_lock(&hmm->lock);
        list_del_rcu(&range->list);
        spin_unlock(&hmm->lock);

        return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);
/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
 * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(range, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(range);
 *     // You might want to rate limit or yield to play nicely, you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   case -ENOMEM:
 *   case -EINVAL:
 *   case -EPERM:
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct hmm_range *range, bool block)
{
        struct vm_area_struct *vma = range->vma;
        unsigned long start = range->start;
        struct hmm_vma_walk hmm_vma_walk;
        struct mm_walk mm_walk;
        struct hmm *hmm;
        int ret;

        /* Sanity check, this really should not happen ! */
        if (range->start < vma->vm_start || range->start >= vma->vm_end)
                return -EINVAL;
        if (range->end < vma->vm_start || range->end > vma->vm_end)
                return -EINVAL;

        hmm = hmm_register(vma->vm_mm);
        if (!hmm) {
                hmm_pfns_clear(range, range->pfns, range->start, range->end);
                return -ENOMEM;
        }
        /* Caller must have registered a mirror using hmm_mirror_register() */
        if (!hmm->mmu_notifier.ops)
                return -EINVAL;

        /* FIXME support hugetlb fs */
        if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
                        vma_is_dax(vma)) {
                hmm_pfns_special(range);
                return -EINVAL;
        }

        if (!(vma->vm_flags & VM_READ)) {
                /*
                 * If the vma does not allow read access, then assume that it
                 * does not allow write access, either. Architectures that
                 * allow write without read access are not supported by HMM,
                 * because operations such as atomic access would not work.
                 */
                hmm_pfns_clear(range, range->pfns, range->start, range->end);
                return -EPERM;
        }

        /* Initialize range to track CPU page table update */
        spin_lock(&hmm->lock);
        range->valid = true;
        list_add_rcu(&range->list, &hmm->ranges);
        spin_unlock(&hmm->lock);

        hmm_vma_walk.fault = true;
        hmm_vma_walk.block = block;
        hmm_vma_walk.range = range;
        mm_walk.private = &hmm_vma_walk;
        hmm_vma_walk.last = range->start;

        mm_walk.vma = vma;
        mm_walk.mm = vma->vm_mm;
        mm_walk.pte_entry = NULL;
        mm_walk.test_walk = NULL;
        mm_walk.hugetlb_entry = NULL;
        mm_walk.pmd_entry = hmm_vma_walk_pmd;
        mm_walk.pte_hole = hmm_vma_walk_hole;

        do {
                ret = walk_page_range(start, range->end, &mm_walk);
                start = hmm_vma_walk.last;
        } while (ret == -EAGAIN);

        if (ret) {
                unsigned long i;

                i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
                hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
                               range->end);
                hmm_vma_range_done(range);
        }
        return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
                                       unsigned long addr)
{
        struct page *page;

        page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
        if (!page)
                return NULL;
        lock_page(page);
        return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);

static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
        struct hmm_devmem *devmem;

        devmem = container_of(ref, struct hmm_devmem, ref);
        complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
        struct percpu_ref *ref = data;
        struct hmm_devmem *devmem;

        devmem = container_of(ref, struct hmm_devmem, ref);
        percpu_ref_exit(ref);
        devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}

static void hmm_devmem_ref_kill(void *data)
{
        struct percpu_ref *ref = data;
        struct hmm_devmem *devmem;

        devmem = container_of(ref, struct hmm_devmem, ref);
        percpu_ref_kill(ref);
        wait_for_completion(&devmem->completion);
        devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}

static int hmm_devmem_fault(struct vm_area_struct *vma,
                            unsigned long addr,
                            const struct page *page,
                            unsigned int flags,
                            pmd_t *pmdp)
{
        struct hmm_devmem *devmem = page->pgmap->data;

        return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
        struct hmm_devmem *devmem = data;

        page->mapping = NULL;

        devmem->ops->free(devmem, page);
}

static DEFINE_MUTEX(hmm_devmem_lock);
static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);

static void hmm_devmem_radix_release(struct resource *resource)
{
        resource_size_t key;

        mutex_lock(&hmm_devmem_lock);
        for (key = resource->start;
             key <= resource->end;
             key += PA_SECTION_SIZE)
                radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
        mutex_unlock(&hmm_devmem_lock);
}
static void hmm_devmem_release(struct device *dev, void *data)
{
        struct hmm_devmem *devmem = data;
        struct resource *resource = devmem->resource;
        unsigned long start_pfn, npages;
        struct zone *zone;
        struct page *page;

        if (percpu_ref_tryget_live(&devmem->ref)) {
                dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
                percpu_ref_put(&devmem->ref);
        }

        /* pages are dead and unused, undo the arch mapping */
        start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
        npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;

        page = pfn_to_page(start_pfn);
        zone = page_zone(page);

        mem_hotplug_begin();
        if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
                __remove_pages(zone, start_pfn, npages, NULL);
        else
                arch_remove_memory(start_pfn << PAGE_SHIFT,
                                   npages << PAGE_SHIFT, NULL);
        mem_hotplug_done();

        hmm_devmem_radix_release(resource);
}
static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
        resource_size_t key, align_start, align_size, align_end;
        struct device *device = devmem->device;
        int ret, nid, is_ram;

        align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
        align_size = ALIGN(devmem->resource->start +
                           resource_size(devmem->resource),
                           PA_SECTION_SIZE) - align_start;

        is_ram = region_intersects(align_start, align_size,
                                   IORESOURCE_SYSTEM_RAM,
                                   IORES_DESC_NONE);
        if (is_ram == REGION_MIXED) {
                WARN_ONCE(1, "%s attempted on mixed region %pr\n",
                          __func__, devmem->resource);
                return -ENXIO;
        }
        if (is_ram == REGION_INTERSECTS)
                return -ENXIO;

        if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
                devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
        else
                devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

        devmem->pagemap.res = *devmem->resource;
        devmem->pagemap.page_fault = hmm_devmem_fault;
        devmem->pagemap.page_free = hmm_devmem_free;
        devmem->pagemap.dev = devmem->device;
        devmem->pagemap.ref = &devmem->ref;
        devmem->pagemap.data = devmem;

        mutex_lock(&hmm_devmem_lock);
        align_end = align_start + align_size - 1;
        for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
                struct hmm_devmem *dup;

                dup = radix_tree_lookup(&hmm_devmem_radix,
                                        key >> PA_SECTION_SHIFT);
                if (dup) {
                        dev_err(device, "%s: collides with mapping for %s\n",
                                __func__, dev_name(dup->device));
                        mutex_unlock(&hmm_devmem_lock);
                        ret = -EBUSY;
                        goto error;
                }
                ret = radix_tree_insert(&hmm_devmem_radix,
                                        key >> PA_SECTION_SHIFT,
                                        devmem);
                if (ret) {
                        dev_err(device, "%s: failed: %d\n", __func__, ret);
                        mutex_unlock(&hmm_devmem_lock);
                        goto error_radix;
                }
        }
        mutex_unlock(&hmm_devmem_lock);

        nid = dev_to_node(device);
        if (nid < 0)
                nid = numa_mem_id();

        mem_hotplug_begin();
        /*
         * For device private memory we call add_pages() as we only need to
         * allocate and initialize struct page for the device memory. Moreover
         * the device memory is inaccessible, thus we do not want to create a
         * linear mapping for the memory like arch_add_memory() would do.
         *
         * For device public memory, which is accessible by the CPU, we do
         * want the linear mapping and thus use arch_add_memory().
         */
        if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
                ret = arch_add_memory(nid, align_start, align_size, NULL,
                                      false);
        else
                ret = add_pages(nid, align_start >> PAGE_SHIFT,
                                align_size >> PAGE_SHIFT, NULL, false);
        if (ret) {
                mem_hotplug_done();
                goto error_add_memory;
        }
        move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
                               align_start >> PAGE_SHIFT,
                               align_size >> PAGE_SHIFT, NULL);
        mem_hotplug_done();

        /*
         * Initialization of the pages has been deferred until now in order
         * to allow us to do the work while not holding the hotplug lock.
         */
        memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
                                align_start >> PAGE_SHIFT,
                                align_size >> PAGE_SHIFT, &devmem->pagemap);

        return 0;

error_add_memory:
        untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
error_radix:
        hmm_devmem_radix_release(devmem->resource);
error:
        return ret;
}
static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
{
        struct hmm_devmem *devmem = data;

        return devmem->resource == match_data;
}

static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
{
        devres_release(devmem->device, &hmm_devmem_release,
                       &hmm_devmem_match, devmem->resource);
}
/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to new hmm_devmem struct, ERR_PTR otherwise
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by hmm_devmem_ops struct.
 *
 * Device driver should call this function during device initialization and
 * is then responsible for memory management. HMM only provides helpers.
 * An illustrative sketch follows the function below.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
                                  struct device *device,
                                  unsigned long size)
{
        struct hmm_devmem *devmem;
        resource_size_t addr;
        int ret;

        dev_pagemap_get_ops();

        devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
                                   GFP_KERNEL, dev_to_node(device));
        if (!devmem)
                return ERR_PTR(-ENOMEM);

        init_completion(&devmem->completion);
        devmem->pfn_first = -1UL;
        devmem->pfn_last = -1UL;
        devmem->resource = NULL;
        devmem->device = device;
        devmem->ops = ops;

        ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
                              0, GFP_KERNEL);
        if (ret)
                goto error_percpu_ref;

        ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
        if (ret)
                goto error_devm_add_action;

        size = ALIGN(size, PA_SECTION_SIZE);
        addr = min((unsigned long)iomem_resource.end,
                   (1UL << MAX_PHYSMEM_BITS) - 1);
        addr = addr - size + 1UL;

        /*
         * FIXME add a new helper to quickly walk resource tree and find free
         * range
         *
         * FIXME what about ioport_resource resource ?
         */
        for (; addr > size && addr >= iomem_resource.start; addr -= size) {
                ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
                if (ret != REGION_DISJOINT)
                        continue;

                devmem->resource = devm_request_mem_region(device, addr, size,
                                                           dev_name(device));
                if (!devmem->resource) {
                        ret = -ENOMEM;
                        goto error_no_resource;
                }
                break;
        }
        if (!devmem->resource) {
                ret = -ERANGE;
                goto error_no_resource;
        }

        devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
        devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
        devmem->pfn_last = devmem->pfn_first +
                           (resource_size(devmem->resource) >> PAGE_SHIFT);

        ret = hmm_devmem_pages_create(devmem);
        if (ret)
                goto error_pages;

        devres_add(device, devmem);

        ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
        if (ret) {
                hmm_devmem_remove(devmem);
                return ERR_PTR(ret);
        }

        return devmem;

error_pages:
        devm_release_mem_region(device, devmem->resource->start,
                                resource_size(devmem->resource));
error_no_resource:
error_devm_add_action:
        hmm_devmem_ref_kill(&devmem->ref);
        hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
        devres_free(devmem);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add);
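
/*
 * Illustrative usage sketch (not part of the original file): a driver that
 * wants to manage its own on-device memory supplies hmm_devmem_ops callbacks
 * and calls hmm_devmem_add() at probe time. The names my_devmem_fault,
 * my_devmem_free and my_pci_dev below are hypothetical, shown only to
 * clarify how the pieces fit together.
 *
 *   static int my_devmem_fault(struct hmm_devmem *devmem,
 *                              struct vm_area_struct *vma,
 *                              unsigned long addr,
 *                              const struct page *page,
 *                              unsigned int flags,
 *                              pmd_t *pmdp)
 *   {
 *           // Migrate the device page back to system memory and return the
 *           // VM_FAULT_* code expected by the CPU fault path.
 *           return VM_FAULT_SIGBUS;
 *   }
 *
 *   static void my_devmem_free(struct hmm_devmem *devmem, struct page *page)
 *   {
 *           // Return the backing device memory to the driver's allocator.
 *   }
 *
 *   static const struct hmm_devmem_ops my_devmem_ops = {
 *           .fault = my_devmem_fault,
 *           .free = my_devmem_free,
 *   };
 *
 *   devmem = hmm_devmem_add(&my_devmem_ops, &my_pci_dev->dev, SZ_1G);
 *   if (IS_ERR(devmem))
 *           return PTR_ERR(devmem);
 *   // Device pages then span devmem->pfn_first .. devmem->pfn_last - 1.
 */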
struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
                                           struct device *device,
                                           struct resource *res)
{
        struct hmm_devmem *devmem;
        int ret;

        if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
                return ERR_PTR(-EINVAL);

        dev_pagemap_get_ops();

        devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
                                   GFP_KERNEL, dev_to_node(device));
        if (!devmem)
                return ERR_PTR(-ENOMEM);

        init_completion(&devmem->completion);
        devmem->pfn_first = -1UL;
        devmem->pfn_last = -1UL;
        devmem->resource = res;
        devmem->device = device;
        devmem->ops = ops;

        ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
                              0, GFP_KERNEL);
        if (ret)
                goto error_percpu_ref;

        ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
        if (ret)
                goto error_devm_add_action;

        devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
        devmem->pfn_last = devmem->pfn_first +
                           (resource_size(devmem->resource) >> PAGE_SHIFT);

        ret = hmm_devmem_pages_create(devmem);
        if (ret)
                goto error_devm_add_action;

        devres_add(device, devmem);

        ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
        if (ret) {
                hmm_devmem_remove(devmem);
                return ERR_PTR(ret);
        }

        return devmem;

error_devm_add_action:
        hmm_devmem_ref_kill(&devmem->ref);
        hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
        devres_free(devmem);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add_resource);
/*
 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
 *
 * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
 *
 * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
 * of the device driver. It will free struct page and remove the resource that
 * reserved the physical address range for this device memory.
 */
void hmm_devmem_remove(struct hmm_devmem *devmem)
{
        resource_size_t start, size;
        struct device *device;
        bool cdm = false;

        if (!devmem)
                return;

        device = devmem->device;
        start = devmem->resource->start;
        size = resource_size(devmem->resource);

        cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
        hmm_devmem_ref_kill(&devmem->ref);
        hmm_devmem_ref_exit(&devmem->ref);
        hmm_devmem_pages_remove(devmem);

        if (!cdm)
                devm_release_mem_region(device, start, size);
}
EXPORT_SYMBOL(hmm_devmem_remove);
/*
 * A device driver that wants to handle the memory of multiple devices through
 * a single fake device can use hmm_device to do so. This is purely a helper
 * and it is not needed to make use of any HMM functionality. An illustrative
 * sketch follows hmm_device_put() below.
 */
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
        struct hmm_device *hmm_device;

        hmm_device = container_of(device, struct hmm_device, device);
        spin_lock(&hmm_device_lock);
        clear_bit(hmm_device->minor, hmm_device_mask);
        spin_unlock(&hmm_device_lock);

        kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
        struct hmm_device *hmm_device;

        hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
        if (!hmm_device)
                return ERR_PTR(-ENOMEM);

        spin_lock(&hmm_device_lock);
        hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
        if (hmm_device->minor >= HMM_DEVICE_MAX) {
                spin_unlock(&hmm_device_lock);
                kfree(hmm_device);
                return ERR_PTR(-EBUSY);
        }
        set_bit(hmm_device->minor, hmm_device_mask);
        spin_unlock(&hmm_device_lock);

        dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
        hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
                                        hmm_device->minor);
        hmm_device->device.release = hmm_device_release;
        dev_set_drvdata(&hmm_device->device, drvdata);
        hmm_device->device.class = hmm_device_class;
        device_initialize(&hmm_device->device);

        return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
        put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);
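
/*
 * Illustrative usage sketch (not part of the original file): a driver can
 * allocate one fake hmm_device per managed memory instance and hang its
 * private state off it. my_driver_state is a hypothetical structure, used
 * only to show the pairing of hmm_device_new() and hmm_device_put().
 *
 *   struct my_driver_state *state = kzalloc(sizeof(*state), GFP_KERNEL);
 *   struct hmm_device *hdev;
 *
 *   hdev = hmm_device_new(state);
 *   if (IS_ERR(hdev))
 *           return PTR_ERR(hdev);
 *   // ... later, dev_get_drvdata(&hdev->device) returns state ...
 *   hmm_device_put(hdev);
 */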
static int __init hmm_init(void)
{
        int ret;

        ret = alloc_chrdev_region(&hmm_device_devt, 0,
                                  HMM_DEVICE_MAX,
                                  "hmm_device");
        if (ret)
                return ret;

        hmm_device_class = class_create(THIS_MODULE, "hmm_device");
        if (IS_ERR(hmm_device_class)) {
                unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
                return PTR_ERR(hmm_device_class);
        }
        return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */