/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>

/*
 * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h
 */
DEFINE_STATIC_KEY_FALSE(device_private_key);
EXPORT_SYMBOL(device_private_key);
#ifdef CONFIG_HMM
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @sequence: we track updates to the CPU page table with a sequence number
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
        struct mm_struct        *mm;
        spinlock_t              lock;
        atomic_t                sequence;
        struct list_head        ranges;
        struct list_head        mirrors;
        struct mmu_notifier     mmu_notifier;
        struct rw_semaphore     mirrors_sem;
};
/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
        struct hmm *hmm = READ_ONCE(mm->hmm);
        bool cleanup = false;

        /*
         * The hmm struct can only be freed once the mm_struct goes away,
         * hence we should always have pre-allocated a new hmm struct
         * above.
         */
        if (hmm)
                return hmm;

        hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
        if (!hmm)
                return NULL;
        INIT_LIST_HEAD(&hmm->mirrors);
        init_rwsem(&hmm->mirrors_sem);
        atomic_set(&hmm->sequence, 0);
        hmm->mmu_notifier.ops = NULL;
        INIT_LIST_HEAD(&hmm->ranges);
        spin_lock_init(&hmm->lock);
        hmm->mm = mm;

        /*
         * We should only get here if we hold the mmap_sem in write mode,
         * ie. on registration of the first mirror through
         * hmm_mirror_register().
         */
        hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
        if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
                kfree(hmm);
                return NULL;
        }

        spin_lock(&mm->page_table_lock);
        if (!mm->hmm)
                mm->hmm = hmm;
        else
                cleanup = true;
        spin_unlock(&mm->page_table_lock);

        if (cleanup) {
                mmu_notifier_unregister(&hmm->mmu_notifier, mm);
                kfree(hmm);
        }

        return mm->hmm;
}
void hmm_mm_destroy(struct mm_struct *mm)
{
        kfree(mm->hmm);
}
#endif /* CONFIG_HMM */

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static void hmm_invalidate_range(struct hmm *hmm,
                                 enum hmm_update_type action,
                                 unsigned long start,
                                 unsigned long end)
{
        struct hmm_mirror *mirror;
        struct hmm_range *range;

        spin_lock(&hmm->lock);
        list_for_each_entry(range, &hmm->ranges, list) {
                unsigned long addr, idx, npages;

                if (end < range->start || start >= range->end)
                        continue;

                range->valid = false;
                addr = max(start, range->start);
                idx = (addr - range->start) >> PAGE_SHIFT;
                npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
                memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
        }
        spin_unlock(&hmm->lock);

        down_read(&hmm->mirrors_sem);
        list_for_each_entry(mirror, &hmm->mirrors, list)
                mirror->ops->sync_cpu_device_pagetables(mirror, action,
                                                        start, end);
        up_read(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long start,
                                       unsigned long end)
{
        struct hmm *hmm = mm->hmm;

        VM_BUG_ON(!hmm);

        atomic_inc(&hmm->sequence);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
                                     struct mm_struct *mm,
                                     unsigned long start,
                                     unsigned long end)
{
        struct hmm *hmm = mm->hmm;

        VM_BUG_ON(!hmm);

        hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
        .invalidate_range_start = hmm_invalidate_range_start,
        .invalidate_range_end = hmm_invalidate_range_end,
};
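
/*
 * Example (illustrative sketch only, not part of this file): the
 * sync_cpu_device_pagetables() callback invoked above is where a driver is
 * expected to tear down its device mappings for [start, end) and flush the
 * device TLB before returning. The dummy_* names below are hypothetical
 * driver helpers, shown only to make the expected callback shape concrete;
 * the only action used by this file is HMM_UPDATE_INVALIDATE.
 *
 *   static void dummy_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *                                                enum hmm_update_type action,
 *                                                unsigned long start,
 *                                                unsigned long end)
 *   {
 *           struct dummy_device *ddev;
 *
 *           ddev = container_of(mirror, struct dummy_device, mirror);
 *           // Drop device page table entries covering [start, end) and
 *           // flush the device TLB before returning to HMM.
 *           dummy_device_unmap_range(ddev, start, end);
 *           dummy_device_flush_tlb(ddev);
 *   }
 */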
/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
        /* Sanity check */
        if (!mm || !mirror || !mirror->ops)
                return -EINVAL;

        mirror->hmm = hmm_register(mm);
        if (!mirror->hmm)
                return -ENOMEM;

        down_write(&mirror->hmm->mirrors_sem);
        list_add(&mirror->list, &mirror->hmm->mirrors);
        up_write(&mirror->hmm->mirrors_sem);

        return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);
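
/*
 * Example (illustrative sketch, hypothetical dummy_* driver names): a driver
 * typically embeds a struct hmm_mirror in its per-process state, points it at
 * its hmm_mirror_ops, and registers it while holding mmap_sem in write mode
 * as required above.
 *
 *   static const struct hmm_mirror_ops dummy_mirror_ops = {
 *           .sync_cpu_device_pagetables = dummy_sync_cpu_device_pagetables,
 *   };
 *
 *   static int dummy_device_mirror_mm(struct dummy_device *ddev,
 *                                     struct mm_struct *mm)
 *   {
 *           int ret;
 *
 *           ddev->mirror.ops = &dummy_mirror_ops;
 *           down_write(&mm->mmap_sem);
 *           ret = hmm_mirror_register(&ddev->mirror, mm);
 *           up_write(&mm->mmap_sem);
 *           return ret;
 *   }
 */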
/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and clean up.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
        struct hmm *hmm = mirror->hmm;

        down_write(&hmm->mirrors_sem);
        list_del(&mirror->list);
        up_write(&hmm->mirrors_sem);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
struct hmm_vma_walk {
        struct hmm_range        *range;
        unsigned long           last;
        bool                    fault;
        bool                    block;
        bool                    write;
};

static int hmm_vma_do_fault(struct mm_walk *walk,
                            unsigned long addr,
                            hmm_pfn_t *pfn)
{
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct vm_area_struct *vma = walk->vma;
        int r;

        flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
        flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
        r = handle_mm_fault(vma, addr, flags);
        if (r & VM_FAULT_RETRY)
                return -EBUSY;
        if (r & VM_FAULT_ERROR) {
                *pfn = HMM_PFN_ERROR;
                return -EFAULT;
        }

        return -EAGAIN;
}

static void hmm_pfns_special(hmm_pfn_t *pfns,
                             unsigned long addr,
                             unsigned long end)
{
        for (; addr < end; addr += PAGE_SIZE, pfns++)
                *pfns = HMM_PFN_SPECIAL;
}
static int hmm_pfns_bad(unsigned long addr,
                        unsigned long end,
                        struct mm_walk *walk)
{
        /*
         * walk->private points to the struct hmm_vma_walk set up by the
         * callers of walk_page_range(), not directly to the range.
         */
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long i;

        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++)
                pfns[i] = HMM_PFN_ERROR;

        return 0;
}
static void hmm_pfns_clear(hmm_pfn_t *pfns,
                           unsigned long addr,
                           unsigned long end)
{
        for (; addr < end; addr += PAGE_SIZE, pfns++)
                *pfns = 0;
}
static int hmm_vma_walk_hole(unsigned long addr,
                             unsigned long end,
                             struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long i;

        hmm_vma_walk->last = addr;
        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = HMM_PFN_EMPTY;
                if (hmm_vma_walk->fault) {
                        int ret;

                        ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
                        if (ret != -EAGAIN)
                                return ret;
                }
        }

        return hmm_vma_walk->fault ? -EAGAIN : 0;
}

static int hmm_vma_walk_clear(unsigned long addr,
                              unsigned long end,
                              struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long i;

        hmm_vma_walk->last = addr;
        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = 0;
                if (hmm_vma_walk->fault) {
                        int ret;

                        ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
                        if (ret != -EAGAIN)
                                return ret;
                }
        }

        return hmm_vma_walk->fault ? -EAGAIN : 0;
}
static int hmm_vma_walk_pmd(pmd_t *pmdp,
                            unsigned long start,
                            unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long addr = start, i;
        bool write_fault;
        hmm_pfn_t flag;
        pte_t *ptep;

        i = (addr - range->start) >> PAGE_SHIFT;
        flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
        write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;

again:
        if (pmd_none(*pmdp))
                return hmm_vma_walk_hole(start, end, walk);

        if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
                return hmm_pfns_bad(start, end, walk);

        if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
                unsigned long pfn;
                pmd_t pmd;

                /*
                 * No need to take the pmd lock here: even if some other
                 * thread is splitting the huge pmd we will get that event
                 * through the mmu_notifier callback.
                 *
                 * So just read the pmd value and check again that it is a
                 * transparent huge or device mapping, then compute the
                 * corresponding pfn values.
                 */
                pmd = pmd_read_atomic(pmdp);
                barrier();
                if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
                        goto again;
                if (pmd_protnone(pmd))
                        return hmm_vma_walk_clear(start, end, walk);
                if (write_fault && !pmd_write(pmd))
                        return hmm_vma_walk_clear(start, end, walk);

                pfn = pmd_pfn(pmd) + pte_index(addr);
                flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
                for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
                        pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
                return 0;
        }

        if (pmd_bad(*pmdp))
                return hmm_pfns_bad(start, end, walk);

        ptep = pte_offset_map(pmdp, addr);
        for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
                pte_t pte = *ptep;

                pfns[i] = 0;

                if (pte_none(pte)) {
                        pfns[i] = HMM_PFN_EMPTY;
                        if (hmm_vma_walk->fault)
                                goto fault;
                        continue;
                }

                if (!pte_present(pte)) {
                        swp_entry_t entry = pte_to_swp_entry(pte);

                        if (!non_swap_entry(entry)) {
                                if (hmm_vma_walk->fault)
                                        goto fault;
                                continue;
                        }

                        /*
                         * This is a special swap entry: ignore migration
                         * (waiting for it when faulting) and report anything
                         * else as an error.
                         */
                        if (is_migration_entry(entry)) {
                                if (hmm_vma_walk->fault) {
                                        pte_unmap(ptep);
                                        hmm_vma_walk->last = addr;
                                        migration_entry_wait(vma->vm_mm,
                                                             pmdp, addr);
                                        return -EAGAIN;
                                }
                                continue;
                        } else {
                                /* Report error for everything else */
                                pfns[i] = HMM_PFN_ERROR;
                        }
                        continue;
                }

                if (write_fault && !pte_write(pte))
                        goto fault;

                pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
                pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
                continue;

fault:
                pte_unmap(ptep);
                /* Fault all pages in the range */
                return hmm_vma_walk_clear(start, end, walk);
        }
        pte_unmap(ptep - 1);

        return 0;
}
/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @vma: virtual memory area containing the virtual address range
 * @range: used to track snapshot validity
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: array of hmm_pfn_t, provided by the caller and filled in by the
 *        function
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by the range struct. See hmm_vma_range_done() for
 * further information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
 */
int hmm_vma_get_pfns(struct vm_area_struct *vma,
                     struct hmm_range *range,
                     unsigned long start,
                     unsigned long end,
                     hmm_pfn_t *pfns)
{
        struct hmm_vma_walk hmm_vma_walk;
        struct mm_walk mm_walk;
        struct hmm *hmm;

        /* FIXME support hugetlb fs */
        if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
                hmm_pfns_special(pfns, start, end);
                return -EINVAL;
        }

        /* Sanity check, this really should not happen ! */
        if (start < vma->vm_start || start >= vma->vm_end)
                return -EINVAL;
        if (end < vma->vm_start || end > vma->vm_end)
                return -EINVAL;

        hmm = hmm_register(vma->vm_mm);
        if (!hmm)
                return -ENOMEM;
        /* Caller must have registered a mirror, via hmm_mirror_register() ! */
        if (!hmm->mmu_notifier.ops)
                return -EINVAL;

        /* Initialize range to track CPU page table update */
        range->start = start;
        range->pfns = pfns;
        range->end = end;
        spin_lock(&hmm->lock);
        range->valid = true;
        list_add_rcu(&range->list, &hmm->ranges);
        spin_unlock(&hmm->lock);

        hmm_vma_walk.fault = false;
        hmm_vma_walk.range = range;
        mm_walk.private = &hmm_vma_walk;

        mm_walk.vma = vma;
        mm_walk.mm = vma->vm_mm;
        mm_walk.pte_entry = NULL;
        mm_walk.test_walk = NULL;
        mm_walk.hugetlb_entry = NULL;
        mm_walk.pmd_entry = hmm_vma_walk_pmd;
        mm_walk.pte_hole = hmm_vma_walk_hole;

        walk_page_range(start, end, &mm_walk);

        return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);
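
/*
 * Example (illustrative sketch): after a successful hmm_vma_get_pfns(), the
 * entry for a virtual address addr lives at index
 * (addr - range->start) >> PAGE_SHIFT, and the HMM_PFN_* flags describe what
 * the CPU page table holds. The hmm_pfn_t_to_pfn() helper is assumed to come
 * from include/linux/hmm.h; dummy_device_map() is a hypothetical driver
 * function.
 *
 *   for (addr = range.start; addr < range.end; addr += PAGE_SIZE) {
 *           unsigned long i = (addr - range.start) >> PAGE_SHIFT;
 *
 *           if (pfns[i] & (HMM_PFN_ERROR | HMM_PFN_SPECIAL))
 *                   continue;       // nothing the device can map here
 *           if (pfns[i] & HMM_PFN_EMPTY)
 *                   continue;       // hole, fault it with hmm_vma_fault()
 *           if (pfns[i] & HMM_PFN_READ)
 *                   dummy_device_map(ddev, addr, hmm_pfn_t_to_pfn(pfns[i]),
 *                                    pfns[i] & HMM_PFN_WRITE);
 *   }
 */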
/*
 * hmm_vma_range_done() - stop tracking changes to CPU page table over a range
 * @vma: virtual memory area containing the virtual address range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * The range struct is used to track updates to the CPU page table after a call
 * to either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is
 * done using the data, or wants to lock updates to the data it got from those
 * functions, it must call hmm_vma_range_done(), which will then stop tracking
 * CPU page table updates.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or by
 * using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this:
 * again:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(vma, range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(vma, range);
 *   device_update_page_table(pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
{
        unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
        struct hmm *hmm;

        if (range->end <= range->start) {
                BUG();
                return false;
        }

        hmm = hmm_register(vma->vm_mm);
        if (!hmm) {
                memset(range->pfns, 0, sizeof(*range->pfns) * npages);
                return false;
        }

        spin_lock(&hmm->lock);
        list_del_rcu(&range->list);
        spin_unlock(&hmm->lock);

        return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);
/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @vma: virtual memory area containing the virtual address range
 * @range: used to track pfns array content validity
 * @start: fault range virtual start address (inclusive)
 * @end: fault range virtual end address (exclusive)
 * @pfns: array of hmm_pfn_t, only entries with the fault flag set are faulted
 * @write: is it a write fault
 * @block: allow blocking on fault (if true it sleeps and does not drop the
 *         mmap_sem)
 * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been
 *          dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will set the
 * hmm_pfn_t error flag for the corresponding pfn entry.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(vma, range, start, end, pfns, write, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(vma, range);
 *     // You might want to rate limit or yield to play nicely, you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(vma, range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct vm_area_struct *vma,
                  struct hmm_range *range,
                  unsigned long start,
                  unsigned long end,
                  hmm_pfn_t *pfns,
                  bool write,
                  bool block)
{
        struct hmm_vma_walk hmm_vma_walk;
        struct mm_walk mm_walk;
        struct hmm *hmm;
        int ret;

        /* Sanity check, this really should not happen ! */
        if (start < vma->vm_start || start >= vma->vm_end)
                return -EINVAL;
        if (end < vma->vm_start || end > vma->vm_end)
                return -EINVAL;

        hmm = hmm_register(vma->vm_mm);
        if (!hmm) {
                hmm_pfns_clear(pfns, start, end);
                return -ENOMEM;
        }
        /* Caller must have registered a mirror using hmm_mirror_register() */
        if (!hmm->mmu_notifier.ops)
                return -EINVAL;

        /* Initialize range to track CPU page table update */
        range->start = start;
        range->pfns = pfns;
        range->end = end;
        spin_lock(&hmm->lock);
        range->valid = true;
        list_add_rcu(&range->list, &hmm->ranges);
        spin_unlock(&hmm->lock);

        /* FIXME support hugetlb fs */
        if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
                hmm_pfns_special(pfns, start, end);
                return 0;
        }

        hmm_vma_walk.fault = true;
        hmm_vma_walk.write = write;
        hmm_vma_walk.block = block;
        hmm_vma_walk.range = range;
        mm_walk.private = &hmm_vma_walk;
        hmm_vma_walk.last = range->start;

        mm_walk.vma = vma;
        mm_walk.mm = vma->vm_mm;
        mm_walk.pte_entry = NULL;
        mm_walk.test_walk = NULL;
        mm_walk.hugetlb_entry = NULL;
        mm_walk.pmd_entry = hmm_vma_walk_pmd;
        mm_walk.pte_hole = hmm_vma_walk_hole;

        do {
                ret = walk_page_range(start, end, &mm_walk);
                start = hmm_vma_walk.last;
        } while (ret == -EAGAIN);

        if (ret) {
                unsigned long i;

                i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
                hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
                hmm_vma_range_done(vma, range);
        }

        return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
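
/*
 * Example (illustrative sketch, hypothetical dummy_* helpers): the commit
 * step left abstract in the "Expected use pattern" above. The device page
 * table is only updated under the driver lock that also serializes
 * sync_cpu_device_pagetables(), and only if hmm_vma_range_done() reports the
 * pfns snapshot is still valid; otherwise everything is retried from the
 * retry label of the pattern above.
 *
 *   dummy_device_page_table_lock(ddev);
 *   if (!hmm_vma_range_done(vma, &range)) {
 *           // A concurrent CPU page table update invalidated the pfns
 *           // array while we were faulting: drop the locks and retry.
 *           dummy_device_page_table_unlock(ddev);
 *           up_read(&mm->mmap_sem);
 *           goto retry;
 *   }
 *   // Decode pfns into device mappings as in the hmm_vma_get_pfns() example
 *   dummy_device_commit_pfns(ddev, &range, pfns);
 *   dummy_device_page_table_unlock(ddev);
 *   up_read(&mm->mmap_sem);
 */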
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */