/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @sequence: we track updates to the CPU page table with a sequence number
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
	struct mm_struct *mm;
	spinlock_t lock;
	atomic_t sequence;
	struct list_head ranges;
	struct list_head mirrors;
	struct mmu_notifier mmu_notifier;
	struct rw_semaphore mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);
	bool cleanup = false;

	/*
	 * The hmm struct can only be freed once the mm_struct goes away,
	 * hence we should always have pre-allocated a new hmm struct
	 * above.
	 */
	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	atomic_set(&hmm->sequence, 0);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	hmm->mm = mm;

	/*
	 * We should only get here if we hold the mmap_sem in write mode,
	 * i.e. on registration of the first mirror through
	 * hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
		kfree(hmm);
		return NULL;
	}

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup) {
		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
		kfree(hmm);
	}

	return mm->hmm;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	kfree(mm->hmm);
}

static void hmm_invalidate_range(struct hmm *hmm,
				 enum hmm_update_type action,
				 unsigned long start,
				 unsigned long end)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		unsigned long addr, idx, npages;

		if (end < range->start || start >= range->end)
			continue;

		range->valid = false;
		addr = max(start, range->start);
		idx = (addr - range->start) >> PAGE_SHIFT;
		npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list)
		mirror->ops->sync_cpu_device_pagetables(mirror, action,
							start, end);
	up_read(&hmm->mirrors_sem);
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct hmm_mirror *mirror;
	struct hmm *hmm = mm->hmm;

	down_write(&hmm->mirrors_sem);
	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
					  list);
	while (mirror) {
		list_del_init(&mirror->list);
		if (mirror->ops->release) {
			/*
			 * Drop mirrors_sem so callback can wait on any pending
			 * work that might itself trigger mmu_notifier callback
			 * and thus would deadlock with us.
			 */
			up_write(&hmm->mirrors_sem);
			mirror->ops->release(mirror);
			down_write(&hmm->mirrors_sem);
		}
		mirror = list_first_entry_or_null(&hmm->mirrors,
						  struct hmm_mirror, list);
	}
	up_write(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start,
				       unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	atomic_inc(&hmm->sequence);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.release = hmm_release,
	.invalidate_range_start = hmm_invalidate_range_start,
	.invalidate_range_end = hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

again:
	mirror->hmm = hmm_register(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	if (mirror->hmm->mm == NULL) {
		/*
		 * A racing hmm_mirror_unregister() is about to destroy the hmm
		 * struct. Try again to allocate a new one.
		 */
		up_write(&mirror->hmm->mirrors_sem);
		mirror->hmm = NULL;
		goto again;
	} else {
		list_add(&mirror->list, &mirror->hmm->mirrors);
		up_write(&mirror->hmm->mirrors_sem);
	}

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);

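/*
 * Example (illustrative sketch only): how a driver might wire up a mirror.
 * The my_* names are hypothetical; only hmm_mirror_register(),
 * hmm_mirror_unregister() and the hmm_mirror_ops callbacks invoked by this
 * file are the real interface.
 *
 *	static void my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *						  enum hmm_update_type update,
 *						  unsigned long start,
 *						  unsigned long end)
 *	{
 *		// invalidate the device page table for [start, end)
 *	}
 *
 *	static const struct hmm_mirror_ops my_mirror_ops = {
 *		.sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
 *	};
 *
 *	// with mm->mmap_sem held in write mode:
 *	my_mirror.ops = &my_mirror_ops;
 *	ret = hmm_mirror_register(&my_mirror, mm);
 *	...
 *	hmm_mirror_unregister(&my_mirror);
 */
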
/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	bool should_unregister = false;
	struct mm_struct *mm;
	struct hmm *hmm;

	if (mirror->hmm == NULL)
		return;

	hmm = mirror->hmm;
	down_write(&hmm->mirrors_sem);
	list_del_init(&mirror->list);
	should_unregister = list_empty(&hmm->mirrors);
	mirror->hmm = NULL;
	mm = hmm->mm;
	hmm->mm = NULL;
	up_write(&hmm->mirrors_sem);

	if (!should_unregister || mm == NULL)
		return;

	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);

	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
	kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
	struct hmm_range *range;
	unsigned long last;
	bool fault;
	bool block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY)
		return -EBUSY;
	if (ret & VM_FAULT_ERROR) {
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	return -EAGAIN;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Returns: 0 on success, -EAGAIN after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return (fault || write_fault) ? -EAGAIN : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	*fault = *write_fault = false;
	if (!hmm_vma_walk->fault)
		return;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory ? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault ? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (!hmm_vma_walk->fault) {
		*fault = *write_fault = false;
		return;
	}

	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*fault) || (*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
			      unsigned long addr,
			      unsigned long end,
			      uint64_t *pfns,
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + pte_index(addr);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
	hmm_vma_walk->last = end;
	return 0;
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);

	if (pte_none(pte)) {
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: ignore migration entries,
		 * translate device-private entries, and report anything else
		 * as an error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(vma->vm_mm,
						     pmdp, addr);
				return -EAGAIN;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	if (fault || write_fault)
		goto fault;

	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;

	i = (addr - range->start) >> PAGE_SHIFT;

again:
	if (pmd_none(*pmdp))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
		pmd_t pmd;

		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * the mmu_notifier callback.
		 *
		 * So just read the pmd value and check again whether it is a
		 * transparent huge or device mapping one and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	if (pmd_bad(*pmdp))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

static void hmm_pfns_special(struct hmm_range *range)
{
	unsigned long addr = range->start, i = 0;

	for (; addr < range->end; addr += PAGE_SIZE, i++)
		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @range: range being snapshotted
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 *          vma permission, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by the range struct. See hmm_vma_range_done() for
 * further information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
 */
int hmm_vma_get_pfns(struct hmm_range *range)
{
	struct vm_area_struct *vma = range->vma;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;
	/* Caller must have registered a mirror, via hmm_mirror_register() ! */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
	    vma_is_dax(vma)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access, either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = false;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(range->start, range->end, &mm_walk);
	return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);

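/*
 * Example (illustrative sketch only): minimal driver-side setup of a
 * struct hmm_range before calling hmm_vma_get_pfns(). The my_hmm_flags,
 * my_hmm_values tables and npages are hypothetical; only the hmm_range
 * fields and functions used above are the real interface.
 *
 *	range.vma = vma;
 *	range.start = start;		// page aligned, inside vma
 *	range.end = start + (npages << PAGE_SHIFT);
 *	range.pfns = kcalloc(npages, sizeof(*range.pfns), GFP_KERNEL);
 *	range.flags = my_hmm_flags;	// indexed by HMM_PFN_*
 *	range.values = my_hmm_values;
 *
 *	if (!hmm_vma_get_pfns(&range)) {
 *		// build the device mapping from range.pfns, then:
 *		hmm_vma_range_done(&range);	// mandatory on success
 *	}
 */
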
/*
 * hmm_vma_range_done() - stop tracking change to CPU page table over a range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
 * using the data, or wants to lock updates to the data it got from those
 * functions, it must call the hmm_vma_range_done() function, which will then
 * stop tracking CPU page table updates.
 *
 * Note that device driver must still implement general CPU page table update
 * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
 * the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this :
 * again:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(range);
 *   device_update_page_table(range->pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct hmm_range *range)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	struct hmm *hmm;

	if (range->end <= range->start) {
		BUG();
		return false;
	}

	hmm = hmm_register(range->vma->vm_mm);
	if (!hmm) {
		memset(range->pfns, 0, sizeof(*range->pfns) * npages);
		return false;
	}

	spin_lock(&hmm->lock);
	list_del_rcu(&range->list);
	spin_unlock(&hmm->lock);

	return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);

/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
 * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem has been dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(range, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(range);
 *     // You might want to rate limit or yield to play nicely; you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   case -ENOMEM:
 *   case -EINVAL:
 *   case -EPERM:
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct hmm_range *range, bool block)
{
	struct vm_area_struct *vma = range->vma;
	unsigned long start = range->start;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;
	int ret;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -ENOMEM;
	}
	/* Caller must have registered a mirror using hmm_mirror_register() */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
	    vma_is_dax(vma)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access, either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = true;
	hmm_vma_walk.block = block;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	do {
		ret = walk_page_range(start, range->end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
			       range->end);
		hmm_vma_range_done(range);
	}
	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);

#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);

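/*
 * Example (illustrative sketch only): a driver fault path might use this
 * helper to get a locked destination page in system memory before copying
 * device memory back. The copy/migration steps and the error convention of
 * the surrounding handler are hypothetical driver code, not part of HMM.
 *
 *	dpage = hmm_vma_alloc_locked_page(vma, addr);
 *	if (!dpage)
 *		return VM_FAULT_OOM;
 *	// copy data from the device page into dpage, then update the
 *	// CPU/device page tables and unlock dpage
 */
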
static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_exit(ref);
	devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}

static void hmm_devmem_ref_kill(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_kill(ref);
	wait_for_completion(&devmem->completion);
	devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}

static int hmm_devmem_fault(struct vm_area_struct *vma,
			    unsigned long addr,
			    const struct page *page,
			    unsigned int flags,
			    pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	devmem->ops->free(devmem, page);
}

static DEFINE_MUTEX(hmm_devmem_lock);
static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);

static void hmm_devmem_radix_release(struct resource *resource)
{
	resource_size_t key, align_start, align_size;

	align_start = resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);

	mutex_lock(&hmm_devmem_lock);
	for (key = resource->start;
	     key <= resource->end;
	     key += PA_SECTION_SIZE)
		radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
	mutex_unlock(&hmm_devmem_lock);
}

static void hmm_devmem_release(struct device *dev, void *data)
{
	struct hmm_devmem *devmem = data;
	struct resource *resource = devmem->resource;
	unsigned long start_pfn, npages;
	struct zone *zone;
	struct page *page;

	if (percpu_ref_tryget_live(&devmem->ref)) {
		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
		percpu_ref_put(&devmem->ref);
	}

	/* pages are dead and unused, undo the arch mapping */
	start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
	npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;

	page = pfn_to_page(start_pfn);
	zone = page_zone(page);

	mem_hotplug_begin();
	if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
		__remove_pages(zone, start_pfn, npages, NULL);
	else
		arch_remove_memory(start_pfn << PAGE_SHIFT,
				   npages << PAGE_SHIFT, NULL);
	mem_hotplug_done();

	hmm_devmem_radix_release(resource);
}

static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
	resource_size_t key, align_start, align_size, align_end;
	struct device *device = devmem->device;
	int ret, nid, is_ram;
	unsigned long pfn;

	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(devmem->resource->start +
			   resource_size(devmem->resource),
			   PA_SECTION_SIZE) - align_start;

	is_ram = region_intersects(align_start, align_size,
				   IORESOURCE_SYSTEM_RAM,
				   IORES_DESC_NONE);
	if (is_ram == REGION_MIXED) {
		WARN_ONCE(1, "%s attempted on mixed region %pr\n",
			  __func__, devmem->resource);
		return -ENXIO;
	}
	if (is_ram == REGION_INTERSECTS)
		return -ENXIO;

	if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
		devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	else
		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_fault = hmm_devmem_fault;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.dev = devmem->device;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;

	mutex_lock(&hmm_devmem_lock);
	align_end = align_start + align_size - 1;
	for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
		struct hmm_devmem *dup;

		dup = radix_tree_lookup(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT);
		if (dup) {
			dev_err(device, "%s: collides with mapping for %s\n",
				__func__, dev_name(dup->device));
			mutex_unlock(&hmm_devmem_lock);
			ret = -EBUSY;
			goto error;
		}
		ret = radix_tree_insert(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT,
					devmem);
		if (ret) {
			dev_err(device, "%s: failed: %d\n", __func__, ret);
			mutex_unlock(&hmm_devmem_lock);
			goto error_radix;
		}
	}
	mutex_unlock(&hmm_devmem_lock);

	nid = dev_to_node(device);
	if (nid < 0)
		nid = numa_mem_id();

	mem_hotplug_begin();
	/*
	 * For device private memory we call add_pages() as we only need to
	 * allocate and initialize struct page for the device memory. Moreover
	 * the device memory is inaccessible, so we do not want to create a
	 * linear mapping for the memory like arch_add_memory() would do.
	 *
	 * For device public memory, which is accessible by the CPU, we do
	 * want the linear mapping and thus use arch_add_memory().
	 */
	if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
		ret = arch_add_memory(nid, align_start, align_size, NULL,
				      false);
	else
		ret = add_pages(nid, align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, NULL, false);
	if (ret) {
		mem_hotplug_done();
		goto error_add_memory;
	}
	move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
			       align_start >> PAGE_SHIFT,
			       align_size >> PAGE_SHIFT, NULL);
	mem_hotplug_done();

	for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->pgmap = &devmem->pagemap;
	}
	return 0;

error_add_memory:
	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
error_radix:
	hmm_devmem_radix_release(devmem->resource);
error:
	return ret;
}

static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
{
	struct hmm_devmem *devmem = data;

	return devmem->resource == match_data;
}

static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
{
	devres_release(devmem->device, &hmm_devmem_release,
		       &hmm_devmem_match, devmem->resource);
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to new hmm_devmem struct, ERR_PTR otherwise
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by the hmm_devmem_ops struct.
 *
 * The device driver should call this function during device initialization
 * and is then responsible for memory management. HMM only provides helpers.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	int ret;

	dev_pagemap_get_ops();

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;

	/*
	 * FIXME add a new helper to quickly walk resource tree and find free
	 * range
	 *
	 * FIXME what about ioport_resource resource ?
	 */
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource) {
			ret = -ENOMEM;
			goto error_no_resource;
		}
		break;
	}
	if (!devmem->resource) {
		ret = -ERANGE;
		goto error_no_resource;
	}

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_pages;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_pages:
	devm_release_mem_region(device, devmem->resource->start,
				resource_size(devmem->resource));
error_no_resource:
error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add);

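/*
 * Example (illustrative sketch only): registering device memory from a
 * driver's probe path. The my_devmem_* callbacks, my_dev and MY_DEVMEM_SIZE
 * are hypothetical; hmm_devmem_add()/hmm_devmem_remove() and the ->fault and
 * ->free callbacks invoked by this file are the real interface.
 *
 *	static const struct hmm_devmem_ops my_devmem_ops = {
 *		.free	= my_devmem_free,	// device page is being freed
 *		.fault	= my_devmem_fault,	// CPU faulted on a device page
 *	};
 *
 *	devmem = hmm_devmem_add(&my_devmem_ops, my_dev, MY_DEVMEM_SIZE);
 *	if (IS_ERR(devmem))
 *		return PTR_ERR(devmem);
 *	// device pages span pfn_to_page(devmem->pfn_first) ..
 *	// pfn_to_page(devmem->pfn_last - 1)
 */
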
struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	dev_pagemap_get_ops();

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_devm_add_action;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add_resource);

/*
 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
 *
 * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
 *
 * This will hot-unplug memory that was hotplugged by hmm_devmem_add() on
 * behalf of the device driver. It will free struct page and remove the
 * resource that reserved the physical address range for this device memory.
 */
void hmm_devmem_remove(struct hmm_devmem *devmem)
{
	resource_size_t start, size;
	struct device *device;
	bool cdm = false;

	if (!devmem)
		return;

	device = devmem->device;
	start = devmem->resource->start;
	size = resource_size(devmem->resource);

	cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
	hmm_devmem_pages_remove(devmem);

	if (!cdm)
		devm_release_mem_region(device, start, size);
}
EXPORT_SYMBOL(hmm_devmem_remove);

/*
 * A device driver that wants to handle multiple devices' memory through a
 * single fake device can use hmm_device to do so. This is purely a helper;
 * it is not required in order to use any other HMM functionality.
 */
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

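/*
 * Example (illustrative sketch only): using the fake device as the owner of
 * hotplugged device memory. my_drvdata, my_devmem_ops and MY_DEVMEM_SIZE are
 * hypothetical; hmm_device_new()/hmm_device_put() and the embedded
 * hmm_device->device are the real interface.
 *
 *	hmm_device = hmm_device_new(my_drvdata);
 *	if (IS_ERR(hmm_device))
 *		return PTR_ERR(hmm_device);
 *	devmem = hmm_devmem_add(&my_devmem_ops, &hmm_device->device,
 *				MY_DEVMEM_SIZE);
 *	...
 *	hmm_device_put(hmm_device);
 */
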
static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */