fmr_ops.c 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2015 Oracle. All rights reserved.
  4. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
  5. */
  6. /* Lightweight memory registration using Fast Memory Regions (FMR).
  7. * Referred to sometimes as MTHCAFMR mode.
  8. *
  9. * FMR uses synchronous memory registration and deregistration.
  10. * FMR registration is known to be fast, but FMR deregistration
  11. * can take tens of usecs to complete.
  12. */
  13. /* Normal operation
  14. *
  15. * A Memory Region is prepared for RDMA READ or WRITE using the
  16. * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
  17. * finished, the Memory Region is unmapped using the ib_unmap_fmr
  18. * verb (fmr_op_unmap).
  19. */
  20. #include "xprt_rdma.h"
  21. #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
  22. # define RPCDBG_FACILITY RPCDBG_TRANS
  23. #endif
  24. /* Maximum scatter/gather per FMR */
  25. #define RPCRDMA_MAX_FMR_SGES (64)
  26. /* Access mode of externally registered pages */
  27. enum {
  28. RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
  29. IB_ACCESS_REMOTE_READ,
  30. };
  31. bool
  32. fmr_is_supported(struct rpcrdma_ia *ia)
  33. {
  34. if (!ia->ri_device->alloc_fmr) {
  35. pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
  36. ia->ri_device->name);
  37. return false;
  38. }
  39. return true;
  40. }
  41. static int
  42. fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
  43. {
  44. static struct ib_fmr_attr fmr_attr = {
  45. .max_pages = RPCRDMA_MAX_FMR_SGES,
  46. .max_maps = 1,
  47. .page_shift = PAGE_SHIFT
  48. };
  49. mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
  50. sizeof(u64), GFP_KERNEL);
  51. if (!mw->fmr.fm_physaddrs)
  52. goto out_free;
  53. mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
  54. sizeof(*mw->mw_sg), GFP_KERNEL);
  55. if (!mw->mw_sg)
  56. goto out_free;
  57. sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
  58. mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
  59. &fmr_attr);
  60. if (IS_ERR(mw->fmr.fm_mr))
  61. goto out_fmr_err;
  62. return 0;
  63. out_fmr_err:
  64. dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
  65. PTR_ERR(mw->fmr.fm_mr));
  66. out_free:
  67. kfree(mw->mw_sg);
  68. kfree(mw->fmr.fm_physaddrs);
  69. return -ENOMEM;
  70. }
  71. static int
  72. __fmr_unmap(struct rpcrdma_mw *mw)
  73. {
  74. LIST_HEAD(l);
  75. int rc;
  76. list_add(&mw->fmr.fm_mr->list, &l);
  77. rc = ib_unmap_fmr(&l);
  78. list_del(&mw->fmr.fm_mr->list);
  79. return rc;
  80. }
  81. static void
  82. fmr_op_release_mr(struct rpcrdma_mw *r)
  83. {
  84. LIST_HEAD(unmap_list);
  85. int rc;
  86. /* Ensure MW is not on any rl_registered list */
  87. if (!list_empty(&r->mw_list))
  88. list_del(&r->mw_list);
  89. kfree(r->fmr.fm_physaddrs);
  90. kfree(r->mw_sg);
  91. /* In case this one was left mapped, try to unmap it
  92. * to prevent dealloc_fmr from failing with EBUSY
  93. */
  94. rc = __fmr_unmap(r);
  95. if (rc)
  96. pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
  97. r, rc);
  98. rc = ib_dealloc_fmr(r->fmr.fm_mr);
  99. if (rc)
  100. pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
  101. r, rc);
  102. kfree(r);
  103. }
  104. /* Reset of a single FMR.
  105. */
  106. static void
  107. fmr_op_recover_mr(struct rpcrdma_mw *mw)
  108. {
  109. struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
  110. int rc;
  111. /* ORDER: invalidate first */
  112. rc = __fmr_unmap(mw);
  113. /* ORDER: then DMA unmap */
  114. ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
  115. mw->mw_sg, mw->mw_nents, mw->mw_dir);
  116. if (rc)
  117. goto out_release;
  118. rpcrdma_put_mw(r_xprt, mw);
  119. r_xprt->rx_stats.mrs_recovered++;
  120. return;
  121. out_release:
  122. pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
  123. r_xprt->rx_stats.mrs_orphaned++;
  124. spin_lock(&r_xprt->rx_buf.rb_mwlock);
  125. list_del(&mw->mw_all);
  126. spin_unlock(&r_xprt->rx_buf.rb_mwlock);
  127. fmr_op_release_mr(mw);
  128. }
  129. static int
  130. fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
  131. struct rpcrdma_create_data_internal *cdata)
  132. {
  133. ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
  134. RPCRDMA_MAX_FMR_SGES);
  135. return 0;
  136. }
  137. /* FMR mode conveys up to 64 pages of payload per chunk segment.
  138. */
  139. static size_t
  140. fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
  141. {
  142. return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
  143. RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
  144. }
  145. /* Use the ib_map_phys_fmr() verb to register a memory region
  146. * for remote access via RDMA READ or RDMA WRITE.
  147. */
  148. static struct rpcrdma_mr_seg *
  149. fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
  150. int nsegs, bool writing, struct rpcrdma_mw **out)
  151. {
  152. struct rpcrdma_mr_seg *seg1 = seg;
  153. int len, pageoff, i, rc;
  154. struct rpcrdma_mw *mw;
  155. u64 *dma_pages;
  156. mw = rpcrdma_get_mw(r_xprt);
  157. if (!mw)
  158. return ERR_PTR(-ENOBUFS);
  159. pageoff = offset_in_page(seg1->mr_offset);
  160. seg1->mr_offset -= pageoff; /* start of page */
  161. seg1->mr_len += pageoff;
  162. len = -pageoff;
  163. if (nsegs > RPCRDMA_MAX_FMR_SGES)
  164. nsegs = RPCRDMA_MAX_FMR_SGES;
  165. for (i = 0; i < nsegs;) {
  166. if (seg->mr_page)
  167. sg_set_page(&mw->mw_sg[i],
  168. seg->mr_page,
  169. seg->mr_len,
  170. offset_in_page(seg->mr_offset));
  171. else
  172. sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
  173. seg->mr_len);
  174. len += seg->mr_len;
  175. ++seg;
  176. ++i;
  177. /* Check for holes */
  178. if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
  179. offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
  180. break;
  181. }
  182. mw->mw_dir = rpcrdma_data_dir(writing);
  183. mw->mw_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device,
  184. mw->mw_sg, i, mw->mw_dir);
  185. if (!mw->mw_nents)
  186. goto out_dmamap_err;
  187. for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
  188. dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
  189. rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
  190. dma_pages[0]);
  191. if (rc)
  192. goto out_maperr;
  193. mw->mw_handle = mw->fmr.fm_mr->rkey;
  194. mw->mw_length = len;
  195. mw->mw_offset = dma_pages[0] + pageoff;
  196. *out = mw;
  197. return seg;
  198. out_dmamap_err:
  199. pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
  200. mw->mw_sg, i);
  201. rpcrdma_put_mw(r_xprt, mw);
  202. return ERR_PTR(-EIO);
  203. out_maperr:
  204. pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
  205. len, (unsigned long long)dma_pages[0],
  206. pageoff, mw->mw_nents, rc);
  207. ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
  208. mw->mw_sg, mw->mw_nents, mw->mw_dir);
  209. rpcrdma_put_mw(r_xprt, mw);
  210. return ERR_PTR(-EIO);
  211. }
  212. /* Invalidate all memory regions that were registered for "req".
  213. *
  214. * Sleeps until it is safe for the host CPU to access the
  215. * previously mapped memory regions.
  216. *
  217. * Caller ensures that @mws is not empty before the call. This
  218. * function empties the list.
  219. */
  220. static void
  221. fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
  222. {
  223. struct rpcrdma_mw *mw;
  224. LIST_HEAD(unmap_list);
  225. int rc;
  226. /* ORDER: Invalidate all of the req's MRs first
  227. *
  228. * ib_unmap_fmr() is slow, so use a single call instead
  229. * of one call per mapped FMR.
  230. */
  231. list_for_each_entry(mw, mws, mw_list) {
  232. dprintk("RPC: %s: unmapping fmr %p\n",
  233. __func__, &mw->fmr);
  234. list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
  235. }
  236. r_xprt->rx_stats.local_inv_needed++;
  237. rc = ib_unmap_fmr(&unmap_list);
  238. if (rc)
  239. goto out_reset;
  240. /* ORDER: Now DMA unmap all of the req's MRs, and return
  241. * them to the free MW list.
  242. */
  243. while (!list_empty(mws)) {
  244. mw = rpcrdma_pop_mw(mws);
  245. dprintk("RPC: %s: DMA unmapping fmr %p\n",
  246. __func__, &mw->fmr);
  247. list_del(&mw->fmr.fm_mr->list);
  248. ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
  249. mw->mw_sg, mw->mw_nents, mw->mw_dir);
  250. rpcrdma_put_mw(r_xprt, mw);
  251. }
  252. return;
  253. out_reset:
  254. pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
  255. while (!list_empty(mws)) {
  256. mw = rpcrdma_pop_mw(mws);
  257. list_del(&mw->fmr.fm_mr->list);
  258. fmr_op_recover_mr(mw);
  259. }
  260. }
  261. const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
  262. .ro_map = fmr_op_map,
  263. .ro_unmap_sync = fmr_op_unmap_sync,
  264. .ro_recover_mr = fmr_op_recover_mr,
  265. .ro_open = fmr_op_open,
  266. .ro_maxpages = fmr_op_maxpages,
  267. .ro_init_mr = fmr_op_init_mr,
  268. .ro_release_mr = fmr_op_release_mr,
  269. .ro_displayname = "fmr",
  270. .ro_send_w_inv_ok = 0,
  271. };