/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree_generic.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

/*
 * The per-mm interval tree keeps track of the memory regions for which the
 * HW device has requested to receive notifications when the related memory
 * mapping is changed.
 *
 * per_mm->umem_rwsem protects the tree.
 */

static u64 node_start(struct umem_odp_node *n)
{
	struct ib_umem_odp *umem_odp =
			container_of(n, struct ib_umem_odp, interval_tree);

	return ib_umem_start(&umem_odp->umem);
}

/* Note that the representation of the intervals in the interval tree
 * considers the ending point as contained in the interval, while the
 * function ib_umem_end returns the first address which is not contained
 * in the umem.
 */
static u64 node_last(struct umem_odp_node *n)
{
	struct ib_umem_odp *umem_odp =
			container_of(n, struct ib_umem_odp, interval_tree);

	return ib_umem_end(&umem_odp->umem) - 1;
}

INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
		     node_start, node_last, static, rbt_ib_umem)

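/*
 * Invalidation accounting: notifiers_count is the number of MMU notifier
 * ranges currently being invalidated on this umem, and notifiers_seq is
 * bumped every time an invalidation finishes.  Page-fault handlers sample
 * notifiers_seq before faulting pages in and use ib_umem_mmu_notifier_retry()
 * to detect (and retry after) a racing invalidation.  notifier_completion is
 * completed whenever the count drops back to zero (or the umem is torn down),
 * so callers that must not race with an invalidation can wait for it.
 */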
static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
{
	mutex_lock(&umem_odp->umem_mutex);
	if (umem_odp->notifiers_count++ == 0)
		/*
		 * Initialize the completion object for waiting on
		 * notifiers. Since notifiers_count is zero, no one should be
		 * waiting right now.
		 */
		reinit_completion(&umem_odp->notifier_completion);
	mutex_unlock(&umem_odp->umem_mutex);
}

static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
{
	mutex_lock(&umem_odp->umem_mutex);
	/*
	 * This sequence increase will notify the QP page fault handler that
	 * the page that is going to be mapped in the spte could have been
	 * freed.
	 */
	++umem_odp->notifiers_seq;
	if (--umem_odp->notifiers_count == 0)
		complete_all(&umem_odp->notifier_completion);
	mutex_unlock(&umem_odp->umem_mutex);
}

static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
					       u64 start, u64 end, void *cookie)
{
	struct ib_umem *umem = &umem_odp->umem;

	/*
	 * Increase the number of notifiers running, to
	 * prevent any further fault handling on this MR.
	 */
	ib_umem_notifier_start_account(umem_odp);
	umem_odp->dying = 1;
	/* Make sure that the fact the umem is dying is visible before we
	 * release all pending page faults. */
	smp_wmb();
	complete_all(&umem_odp->notifier_completion);
	umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
					ib_umem_end(umem));
	return 0;
}

static void ib_umem_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);

	if (!per_mm->context->invalidate_range)
		return;

	down_read(&per_mm->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0,
				      ULLONG_MAX,
				      ib_umem_notifier_release_trampoline,
				      true,
				      NULL);
	up_read(&per_mm->umem_rwsem);
}

static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start,
				      u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->umem.context->invalidate_range(item, start, start + PAGE_SIZE);
	ib_umem_notifier_end_account(item);
	return 0;
}

static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
					     u64 start, u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->umem.context->invalidate_range(item, start, end);
	return 0;
}

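/*
 * Note the asymmetric locking: umem_rwsem is taken for read here and is only
 * released in ib_umem_notifier_invalidate_range_end(), so the interval tree
 * cannot change between the two notifier callbacks.
 */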
static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
						   struct mm_struct *mm,
						   unsigned long start,
						   unsigned long end,
						   bool blockable)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);

	if (!per_mm->context->invalidate_range)
		return 0;

	if (blockable)
		down_read(&per_mm->umem_rwsem);
	else if (!down_read_trylock(&per_mm->umem_rwsem))
		return -EAGAIN;

	return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
					     invalidate_range_start_trampoline,
					     blockable, NULL);
}

static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
					   u64 end, void *cookie)
{
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);

	if (!per_mm->context->invalidate_range)
		return;

	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
				      end,
				      invalidate_range_end_trampoline, true, NULL);
	up_read(&per_mm->umem_rwsem);
}

static const struct mmu_notifier_ops ib_umem_notifiers = {
	.release                    = ib_umem_notifier_release,
	.invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
};

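/*
 * Insert the umem into the per-mm interval tree so that the MMU notifier
 * callbacks above can find it.  Zero-length umems are never inserted.
 */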
static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
{
	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
	struct ib_umem *umem = &umem_odp->umem;

	down_write(&per_mm->umem_rwsem);
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_insert(&umem_odp->interval_tree,
				   &per_mm->umem_tree);
	up_write(&per_mm->umem_rwsem);
}

static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
{
	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
	struct ib_umem *umem = &umem_odp->umem;

	down_write(&per_mm->umem_rwsem);
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_remove(&umem_odp->interval_tree,
				   &per_mm->umem_tree);
	complete_all(&umem_odp->notifier_completion);

	up_write(&per_mm->umem_rwsem);
}

static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
					       struct mm_struct *mm)
{
	struct ib_ucontext_per_mm *per_mm;
	int ret;

	per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
	if (!per_mm)
		return ERR_PTR(-ENOMEM);

	per_mm->context = ctx;
	per_mm->mm = mm;
	per_mm->umem_tree = RB_ROOT_CACHED;
	init_rwsem(&per_mm->umem_rwsem);

	rcu_read_lock();
	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	rcu_read_unlock();

	WARN_ON(mm != current->mm);

	per_mm->mn.ops = &ib_umem_notifiers;
	ret = mmu_notifier_register(&per_mm->mn, per_mm->mm);
	if (ret) {
		dev_err(&ctx->device->dev,
			"Failed to register mmu_notifier %d\n", ret);
		goto out_pid;
	}

	list_add(&per_mm->ucontext_list, &ctx->per_mm_list);
	return per_mm;

out_pid:
	put_pid(per_mm->tgid);
	kfree(per_mm);
	return ERR_PTR(ret);
}

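/*
 * Look up (or create) the ib_ucontext_per_mm matching the umem's owning_mm
 * and take a reference on it via odp_mrs_count.  ctx->per_mm_list_lock
 * serializes this against put_per_mm().
 */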
static int get_per_mm(struct ib_umem_odp *umem_odp)
{
	struct ib_ucontext *ctx = umem_odp->umem.context;
	struct ib_ucontext_per_mm *per_mm;

	/*
	 * Generally speaking we expect only one or two per_mm in this list,
	 * so no reason to optimize this search today.
	 */
	mutex_lock(&ctx->per_mm_list_lock);
	list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) {
		if (per_mm->mm == umem_odp->umem.owning_mm)
			goto found;
	}

	per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm);
	if (IS_ERR(per_mm)) {
		mutex_unlock(&ctx->per_mm_list_lock);
		return PTR_ERR(per_mm);
	}

found:
	umem_odp->per_mm = per_mm;
	per_mm->odp_mrs_count++;
	mutex_unlock(&ctx->per_mm_list_lock);

	return 0;
}

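/*
 * Drop the reference taken by get_per_mm() (or ib_alloc_odp_umem()).  The
 * last reference unregisters the MMU notifier and frees the per_mm.
 */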
void put_per_mm(struct ib_umem_odp *umem_odp)
{
	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
	struct ib_ucontext *ctx = umem_odp->umem.context;
	bool need_free;

	mutex_lock(&ctx->per_mm_list_lock);
	umem_odp->per_mm = NULL;
	per_mm->odp_mrs_count--;
	need_free = per_mm->odp_mrs_count == 0;
	if (need_free)
		list_del(&per_mm->ucontext_list);
	mutex_unlock(&ctx->per_mm_list_lock);

	if (!need_free)
		return;

	mmu_notifier_unregister(&per_mm->mn, per_mm->mm);
	put_pid(per_mm->tgid);
	kfree(per_mm);
}

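/**
 * ib_alloc_odp_umem - allocate an ODP umem covering [addr, addr + size) on
 *                     an existing per-mm tracking structure
 * @per_mm: the per-mm structure the new umem attaches to
 * @addr: start of the virtual address range
 * @size: length of the range, in bytes
 *
 * The new umem shares @per_mm (and takes a reference on it via
 * odp_mrs_count), so the caller must guarantee the per_mm stays alive across
 * this call.  Returns the new ib_umem_odp or an ERR_PTR on failure.
 */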
struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
				      unsigned long addr, size_t size)
{
	struct ib_ucontext *ctx = per_mm->context;
	struct ib_umem_odp *odp_data;
	struct ib_umem *umem;
	int pages = size >> PAGE_SHIFT;
	int ret;

	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
	if (!odp_data)
		return ERR_PTR(-ENOMEM);
	umem = &odp_data->umem;
	umem->context = ctx;
	umem->length = size;
	umem->address = addr;
	umem->page_shift = PAGE_SHIFT;
	umem->writable = 1;
	umem->is_odp = 1;
	odp_data->per_mm = per_mm;

	mutex_init(&odp_data->umem_mutex);
	init_completion(&odp_data->notifier_completion);

	odp_data->page_list =
		vzalloc(array_size(pages, sizeof(*odp_data->page_list)));
	if (!odp_data->page_list) {
		ret = -ENOMEM;
		goto out_odp_data;
	}

	odp_data->dma_list =
		vzalloc(array_size(pages, sizeof(*odp_data->dma_list)));
	if (!odp_data->dma_list) {
		ret = -ENOMEM;
		goto out_page_list;
	}

	/*
	 * Caller must ensure that the umem_odp that the per_mm came from
	 * cannot be freed during the call to ib_alloc_odp_umem.
	 */
	mutex_lock(&ctx->per_mm_list_lock);
	per_mm->odp_mrs_count++;
	mutex_unlock(&ctx->per_mm_list_lock);
	add_umem_to_per_mm(odp_data);

	return odp_data;

out_page_list:
	vfree(odp_data->page_list);
out_odp_data:
	kfree(odp_data);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_alloc_odp_umem);

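/**
 * ib_umem_odp_get - complete the ODP-specific initialization of a umem
 * @umem_odp: the umem to initialize (its embedded ib_umem is already
 *            populated by the caller)
 * @access: IB access flags; IB_ACCESS_HUGETLB selects the huge-page shift
 *
 * Allocates the page and DMA tracking arrays and attaches the umem to the
 * per-mm notifier state.  Must be called from the process that owns the
 * memory, i.e. with umem->owning_mm == current->mm.
 */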
int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
{
	struct ib_umem *umem = &umem_odp->umem;
	/*
	 * NOTE: This must be called in a process context where
	 * umem->owning_mm == current->mm
	 */
	struct mm_struct *mm = umem->owning_mm;
	int ret_val;

	if (access & IB_ACCESS_HUGETLB) {
		struct vm_area_struct *vma;
		struct hstate *h;

		down_read(&mm->mmap_sem);
		vma = find_vma(mm, ib_umem_start(umem));
		if (!vma || !is_vm_hugetlb_page(vma)) {
			up_read(&mm->mmap_sem);
			return -EINVAL;
		}
		h = hstate_vma(vma);
		umem->page_shift = huge_page_shift(h);
		up_read(&mm->mmap_sem);
		umem->hugetlb = 1;
	} else {
		umem->hugetlb = 0;
	}

	mutex_init(&umem_odp->umem_mutex);
	init_completion(&umem_odp->notifier_completion);

	if (ib_umem_num_pages(umem)) {
		umem_odp->page_list =
			vzalloc(array_size(sizeof(*umem_odp->page_list),
					   ib_umem_num_pages(umem)));
		if (!umem_odp->page_list)
			return -ENOMEM;

		umem_odp->dma_list =
			vzalloc(array_size(sizeof(*umem_odp->dma_list),
					   ib_umem_num_pages(umem)));
		if (!umem_odp->dma_list) {
			ret_val = -ENOMEM;
			goto out_page_list;
		}
	}

	ret_val = get_per_mm(umem_odp);
	if (ret_val)
		goto out_dma_list;
	add_umem_to_per_mm(umem_odp);

	return 0;

out_dma_list:
	vfree(umem_odp->dma_list);
out_page_list:
	vfree(umem_odp->page_list);
	return ret_val;
}

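/**
 * ib_umem_odp_release - tear down the ODP-specific state of a umem
 * @umem_odp: the umem to release
 *
 * Unmaps any remaining pages, detaches the umem from its per_mm and frees
 * the page and DMA tracking arrays.
 */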
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
	struct ib_umem *umem = &umem_odp->umem;

	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
				    ib_umem_end(umem));
	remove_umem_from_per_mm(umem_odp);
	put_per_mm(umem_odp);
	vfree(umem_odp->dma_list);
	vfree(umem_odp->page_list);
}

/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
 *
 * @umem: the umem to insert the page to.
 * @page_index: index in the umem to add the page to.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 * @current_seq: sequence number for synchronization with invalidations.
 *               the sequence number is taken from
 *               umem_odp->notifiers_seq.
 *
 * The function returns -EFAULT if the DMA mapping operation fails. It returns
 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 *
 * The page is released via put_page even if the operation failed. For
 * on-demand pinning, the page is released whenever it isn't stored in the
 * umem.
 */
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem_odp *umem_odp,
		int page_index,
		struct page *page,
		u64 access_mask,
		unsigned long current_seq)
{
	struct ib_umem *umem = &umem_odp->umem;
	struct ib_device *dev = umem->context->device;
	dma_addr_t dma_addr;
	int stored_page = 0;
	int remove_existing_mapping = 0;
	int ret = 0;

	/*
	 * Note: we avoid writing if seq is different from the initial seq, to
	 * handle case of a racing notifier. This check also allows us to bail
	 * early if we have a notifier running in parallel with us.
	 */
	if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
		ret = -EAGAIN;
		goto out;
	}
	if (!(umem_odp->dma_list[page_index])) {
		dma_addr = ib_dma_map_page(dev,
					   page,
					   0, BIT(umem->page_shift),
					   DMA_BIDIRECTIONAL);
		if (ib_dma_mapping_error(dev, dma_addr)) {
			ret = -EFAULT;
			goto out;
		}
		umem_odp->dma_list[page_index] = dma_addr | access_mask;
		umem_odp->page_list[page_index] = page;
		umem->npages++;
		stored_page = 1;
	} else if (umem_odp->page_list[page_index] == page) {
		umem_odp->dma_list[page_index] |= access_mask;
	} else {
		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
		       umem_odp->page_list[page_index], page);
		/* Better remove the mapping now, to prevent any further
		 * damage. */
		remove_existing_mapping = 1;
	}

out:
	/* On Demand Paging - avoid pinning the page */
	if (umem->context->invalidate_range || !stored_page)
		put_page(page);

	if (remove_existing_mapping && umem->context->invalidate_range) {
		invalidate_page_trampoline(
			umem_odp,
			ib_umem_start(umem) + (page_index << umem->page_shift),
			ib_umem_start(umem) +
				((page_index + 1) << umem->page_shift),
			NULL);
		ret = -EAGAIN;
	}

	return ret;
}

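/*
 * Sketch of how a driver page-fault handler is typically expected to drive
 * ib_umem_odp_map_dma_pages().  This is only an illustration; the exact
 * sequence, and how the device page tables are programmed, is driver
 * specific:
 *
 *	current_seq = READ_ONCE(umem_odp->notifiers_seq);
 *	smp_rmb();	// read the seq before faulting the pages in
 *	npages = ib_umem_odp_map_dma_pages(umem_odp, io_virt, bcnt,
 *					   access_mask, current_seq);
 *	if (npages < 0)
 *		return npages;
 *	mutex_lock(&umem_odp->umem_mutex);
 *	if (!ib_umem_mmu_notifier_retry(umem_odp, current_seq))
 *		... program the device page tables from umem_odp->dma_list ...
 *	mutex_unlock(&umem_odp->umem_mutex);
 */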
/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are updated in
 * umem_odp->dma_list.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 * An -ENOENT error code indicates that the userspace process is being
 * terminated and its mm was already destroyed.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. The sequence number is read from
 *               umem_odp->notifiers_seq before calling this function.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
			      u64 bcnt, u64 access_mask,
			      unsigned long current_seq)
{
	struct ib_umem *umem = &umem_odp->umem;
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
	struct page **local_page_list = NULL;
	u64 page_mask, off;
	int j, k, ret = 0, start_idx, npages = 0, page_shift;
	unsigned int flags = 0;
	phys_addr_t p = 0;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem) ||
	    user_virt + bcnt > ib_umem_end(umem))
		return -EFAULT;

	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
	if (!local_page_list)
		return -ENOMEM;

	page_shift = umem->page_shift;
	page_mask = ~(BIT(page_shift) - 1);
	off = user_virt & (~page_mask);
	user_virt = user_virt & page_mask;
	bcnt += off; /* Charge for the first page offset as well. */

	/*
	 * owning_process is allowed to be NULL; this means the mm somehow
	 * exists beyond the lifetime of the originating process. Presumably
	 * mmget_not_zero will fail in this case.
	 */
	owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
	if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) {
		ret = -EINVAL;
		goto out_put_task;
	}

	if (access_mask & ODP_WRITE_ALLOWED_BIT)
		flags |= FOLL_WRITE;

	start_idx = (user_virt - ib_umem_start(umem)) >> page_shift;
	k = start_idx;

	while (bcnt > 0) {
		const size_t gup_num_pages = min_t(size_t,
				(bcnt + BIT(page_shift) - 1) >> page_shift,
				PAGE_SIZE / sizeof(struct page *));

		down_read(&owning_mm->mmap_sem);
		/*
		 * Note: this might result in redundant page getting. We can
		 * avoid this by checking dma_list to be 0 before calling
		 * get_user_pages. However, this makes the code much more
		 * complex (and doesn't gain us much performance in most use
		 * cases).
		 */
		npages = get_user_pages_remote(owning_process, owning_mm,
				user_virt, gup_num_pages,
				flags, local_page_list, NULL, NULL);
		up_read(&owning_mm->mmap_sem);

		if (npages < 0)
			break;

		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
		mutex_lock(&umem_odp->umem_mutex);
		for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
			if (user_virt & ~page_mask) {
				p += PAGE_SIZE;
				if (page_to_phys(local_page_list[j]) != p) {
					ret = -EFAULT;
					break;
				}
				put_page(local_page_list[j]);
				continue;
			}

			ret = ib_umem_odp_map_dma_single_page(
					umem_odp, k, local_page_list[j],
					access_mask, current_seq);
			if (ret < 0)
				break;

			p = page_to_phys(local_page_list[j]);
			k++;
		}
		mutex_unlock(&umem_odp->umem_mutex);

		if (ret < 0) {
			/* Release left over pages when handling errors. */
			for (++j; j < npages; ++j)
				put_page(local_page_list[j]);
			break;
		}
	}

	if (ret >= 0) {
		if (npages < 0 && k == start_idx)
			ret = npages;
		else
			ret = k - start_idx;
	}

	mmput(owning_mm);
out_put_task:
	if (owning_process)
		put_task_struct(owning_process);
	free_page((unsigned long)local_page_list);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);

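/**
 * ib_umem_odp_unmap_dma_pages - DMA unmap and release a range of an ODP umem
 * @umem_odp: the umem whose pages are unmapped
 * @virt: start of the virtual range to unmap (clamped to the umem start)
 * @bound: end of the virtual range to unmap (clamped to the umem end)
 *
 * Dirties pages that were mapped writable, drops the page reference when the
 * device lacks invalidation support (pages are not kept pinned otherwise),
 * and clears the corresponding page_list and dma_list entries.
 */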
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
				 u64 bound)
{
	struct ib_umem *umem = &umem_odp->umem;
	int idx;
	u64 addr;
	struct ib_device *dev = umem->context->device;

	virt = max_t(u64, virt, ib_umem_start(umem));
	bound = min_t(u64, bound, ib_umem_end(umem));
	/* Note that during the run of this function, the notifiers_count of
	 * the MR is > 0, preventing any racing faults from completing. We
	 * might be racing with other invalidations, so we must make sure we
	 * free each page only once. */
	mutex_lock(&umem_odp->umem_mutex);
	for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
		if (umem_odp->page_list[idx]) {
			struct page *page = umem_odp->page_list[idx];
			dma_addr_t dma = umem_odp->dma_list[idx];
			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;

			WARN_ON(!dma_addr);
			ib_dma_unmap_page(dev, dma_addr,
					  BIT(umem->page_shift),
					  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
				struct page *head_page = compound_head(page);
				/*
				 * set_page_dirty prefers being called with
				 * the page lock. However, MMU notifiers are
				 * called sometimes with and sometimes without
				 * the lock. We rely on the umem_mutex instead
				 * to prevent other mmu notifiers from
				 * continuing and allowing the page mapping to
				 * be removed.
				 */
				set_page_dirty(head_page);
			}
			/* on demand pinning support */
			if (!umem->context->invalidate_range)
				put_page(page);
			umem_odp->page_list[idx] = NULL;
			umem_odp->dma_list[idx] = 0;
			umem->npages--;
		}
	}
	mutex_unlock(&umem_odp->umem_mutex);
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

/* @last is not a part of the interval. See comment for function
 * node_last.
 */
int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
				  u64 start, u64 last,
				  umem_call_back cb,
				  bool blockable,
				  void *cookie)
{
	int ret_val = 0;
	struct umem_odp_node *node, *next;
	struct ib_umem_odp *umem;

	if (unlikely(start == last))
		return ret_val;

	for (node = rbt_ib_umem_iter_first(root, start, last - 1);
			node; node = next) {
		/* TODO move the blockable decision up to the callback */
		if (!blockable)
			return -EAGAIN;
		next = rbt_ib_umem_iter_next(node, start, last - 1);
		umem = container_of(node, struct ib_umem_odp, interval_tree);
		ret_val = cb(umem, start, last, cookie) || ret_val;
	}

	return ret_val;
}
EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range);

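/*
 * Return an ODP umem from @root that intersects [addr, addr + length - 1],
 * or NULL if the range overlaps no tracked umem.
 */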
struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
				       u64 addr, u64 length)
{
	struct umem_odp_node *node;

	node = rbt_ib_umem_iter_first(root, addr, addr + length - 1);
	if (node)
		return container_of(node, struct ib_umem_odp, interval_tree);
	return NULL;
}
EXPORT_SYMBOL(rbt_ib_umem_lookup);