/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>

#include <rdma/ib_cm.h>

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */
static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf);
static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);

static struct workqueue_struct *rpcrdma_receive_wq __read_mostly;

int
rpcrdma_alloc_wq(void)
{
	struct workqueue_struct *recv_wq;

	recv_wq = alloc_workqueue("xprtrdma_receive",
				  WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI,
				  0);
	if (!recv_wq)
		return -ENOMEM;

	rpcrdma_receive_wq = recv_wq;
	return 0;
}

void
rpcrdma_destroy_wq(void)
{
	struct workqueue_struct *wq;

	if (rpcrdma_receive_wq) {
		wq = rpcrdma_receive_wq;
		rpcrdma_receive_wq = NULL;
		destroy_workqueue(wq);
	}
}
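
/* Usage sketch: the receive workqueue is created once and destroyed once.
 * The callers are assumed to be the transport module's init and exit
 * paths, pairing the two helpers roughly as follows:
 *
 *	rc = rpcrdma_alloc_wq();
 *	if (rc)
 *		return rc;
 *	...
 *	rpcrdma_destroy_wq();
 */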

static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("rpcrdma: %s on device %s ep %p\n",
	       ib_event_msg(event->event), event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

/**
 * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 */
static void
rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
	/* WARNING: Only wr_cqe and status are reliable at this point */
	if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("rpcrdma: Send: %s (%u/0x%x)\n",
		       ib_wc_status_msg(wc->status),
		       wc->status, wc->vendor_err);
}

/* Perform basic sanity checking to avoid using garbage
 * to update the credit grant value.
 */
static void
rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
{
	struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf);
	struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf;
	u32 credits;

	if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
		return;

	credits = be32_to_cpu(rmsgp->rm_credit);
	if (credits == 0)
		credits = 1;	/* don't deadlock */
	else if (credits > buffer->rb_max_requests)
		credits = buffer->rb_max_requests;

	atomic_set(&buffer->rb_credits, credits);
}

/**
 * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 */
static void
rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
					       rr_cqe);

	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->status != IB_WC_SUCCESS)
		goto out_fail;

	/* status == SUCCESS means all fields in wc are trustworthy */
	if (wc->opcode != IB_WC_RECV)
		return;

	dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
		__func__, rep, wc->byte_len);

	rep->rr_len = wc->byte_len;
	rep->rr_wc_flags = wc->wc_flags;
	rep->rr_inv_rkey = wc->ex.invalidate_rkey;

	ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
				   rdmab_addr(rep->rr_rdmabuf),
				   rep->rr_len, DMA_FROM_DEVICE);

	rpcrdma_update_granted_credits(rep);

out_schedule:
	queue_work(rpcrdma_receive_wq, &rep->rr_work);
	return;

out_fail:
	if (wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
		       ib_wc_status_msg(wc->status),
		       wc->status, wc->vendor_err);
	rep->rr_len = RPCRDMA_BAD_LEN;
	goto out_schedule;
}

static void
rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
			       struct rdma_conn_param *param)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	const struct rpcrdma_connect_private *pmsg = param->private_data;
	unsigned int rsize, wsize;

	/* Default settings for RPC-over-RDMA Version One */
	r_xprt->rx_ia.ri_reminv_expected = false;
	r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
	rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
	wsize = RPCRDMA_V1_DEF_INLINE_SIZE;

	if (pmsg &&
	    pmsg->cp_magic == rpcrdma_cmp_magic &&
	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
		r_xprt->rx_ia.ri_reminv_expected = true;
		r_xprt->rx_ia.ri_implicit_roundup = true;
		rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
		wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
	}

	if (rsize < cdata->inline_rsize)
		cdata->inline_rsize = rsize;
	if (wsize < cdata->inline_wsize)
		cdata->inline_wsize = wsize;
	dprintk("RPC: %s: max send %u, max recv %u\n",
		__func__, cdata->inline_wsize, cdata->inline_rsize);
	rpcrdma_set_max_header_sizes(r_xprt);
}

static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
#endif
	struct ib_qp_attr *attr = &ia->ri_qp_attr;
	struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
		pr_info("rpcrdma: removing device for %pIS:%u\n",
			sap, rpc_get_port(sap));
#endif
		set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
		ep->rep_connected = -ENODEV;
		xprt_force_disconnect(&xprt->rx_xprt);
		wait_for_completion(&ia->ri_remove_done);

		ia->ri_id = NULL;
		ia->ri_pd = NULL;
		ia->ri_device = NULL;
		/* Return 1 to ensure the core destroys the id. */
		return 1;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, attr,
			    IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			    iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr->max_dest_rd_atomic,
			attr->max_rd_atomic);
		rpcrdma_update_connect_private(xprt, &event->param.conn);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
		pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n",
			sap, rpc_get_port(sap), ia->ri_device->name,
			rdma_reject_msg(id, event->status));
#endif
		connstate = -ECONNREFUSED;
		if (event->status == IB_CM_REJ_STALE_CONN)
			connstate = -EAGAIN;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
connected:
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		atomic_set(&xprt->rx_buf.rb_credits, 1);
		ep->rep_connected = connstate;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		/*FALLTHROUGH*/
	default:
		dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
			__func__, sap, rpc_get_port(sap), ep,
			rdma_event_msg(event->event));
		break;
	}

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	if (connstate == 1) {
		int ird = attr->max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;

		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
			sap, rpc_get_port(sap),
			ia->ri_device->name,
			ia->ri_ops->ro_displayname,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
			sap, rpc_get_port(sap), connstate);
	}
#endif

	return 0;
}

static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
		  struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);
	init_completion(&ia->ri_remove_done);

	id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
			    IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
	if (rc < 0) {
		dprintk("RPC: %s: wait() exited: %i\n",
			__func__, rc);
		goto out;
	}

	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
	if (rc < 0) {
		dprintk("RPC: %s: wait() exited: %i\n",
			__func__, rc);
		goto out;
	}
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Exported functions.
 */

/**
 * rpcrdma_ia_open - Open and initialize an Interface Adapter.
 * @xprt: controlling transport
 * @addr: IP address of remote peer
 *
 * Returns 0 on success, negative errno if an appropriate
 * Interface Adapter could not be found and opened.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr)
{
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	int rc;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out_err;
	}
	ia->ri_device = ia->ri_id->device;

	ia->ri_pd = ib_alloc_pd(ia->ri_device, 0);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
		goto out_err;
	}

	switch (xprt_rdma_memreg_strategy) {
	case RPCRDMA_FRMR:
		if (frwr_is_supported(ia)) {
			ia->ri_ops = &rpcrdma_frwr_memreg_ops;
			break;
		}
		/*FALLTHROUGH*/
	case RPCRDMA_MTHCAFMR:
		if (fmr_is_supported(ia)) {
			ia->ri_ops = &rpcrdma_fmr_memreg_ops;
			break;
		}
		/*FALLTHROUGH*/
	default:
		pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
		       ia->ri_device->name, xprt_rdma_memreg_strategy);
		rc = -EINVAL;
		goto out_err;
	}

	return 0;

out_err:
	rpcrdma_ia_close(ia);
	return rc;
}

/**
 * rpcrdma_ia_remove - Handle device driver unload
 * @ia: interface adapter being removed
 *
 * Divest transport H/W resources associated with this adapter,
 * but allow it to be restored later.
 */
void
rpcrdma_ia_remove(struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
						   rx_ia);
	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_req *req;
	struct rpcrdma_rep *rep;

	cancel_delayed_work_sync(&buf->rb_refresh_worker);

	/* This is similar to rpcrdma_ep_destroy, but:
	 * - Don't cancel the connect worker.
	 * - Don't call rpcrdma_ep_disconnect, which waits
	 *   for another conn upcall, which will deadlock.
	 * - rdma_disconnect is unneeded, the underlying
	 *   connection is already gone.
	 */
	if (ia->ri_id->qp) {
		ib_drain_qp(ia->ri_id->qp);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}
	ib_free_cq(ep->rep_attr.recv_cq);
	ib_free_cq(ep->rep_attr.send_cq);

	/* The ULP is responsible for ensuring all DMA
	 * mappings and MRs are gone.
	 */
	list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
		rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf);
	list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
		rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf);
		rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
		rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
	}
	rpcrdma_destroy_mrs(buf);

	/* Allow waiters to continue */
	complete(&ia->ri_remove_done);
}

/**
 * rpcrdma_ia_close - Clean up/close an IA.
 * @ia: interface adapter to close
 *
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
	}
	ia->ri_id = NULL;
	ia->ri_device = NULL;

	/* If the pd is still busy, xprtrdma missed freeing a resource */
	if (ia->ri_pd && !IS_ERR(ia->ri_pd))
		ib_dealloc_pd(ia->ri_pd);
	ia->ri_pd = NULL;
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
		  struct rpcrdma_create_data_internal *cdata)
{
	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
	unsigned int max_qp_wr, max_sge;
	struct ib_cq *sendcq, *recvcq;
	int rc;

	max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge,
			RPCRDMA_MAX_SEND_SGES);
	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
		pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
		return -ENOMEM;
	}
	ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES;

	if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
		dprintk("RPC: %s: insufficient wqe's available\n",
			__func__);
		return -ENOMEM;
	}
	max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > max_qp_wr)
		cdata->max_requests = max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_send_wr += 1;	/* drain cqe */
	rc = ia->ri_ops->ro_open(ia, ep, cdata);
	if (rc)
		return rc;
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_recv_wr += 1;	/* drain cqe */
	ep->rep_attr.cap.max_send_sge = max_sge;
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;	/* always signal? */
	rpcrdma_init_cqcount(ep, 0);
	init_waitqueue_head(&ep->rep_connect_wait);
	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);

	sendcq = ib_alloc_cq(ia->ri_device, NULL,
			     ep->rep_attr.cap.max_send_wr + 1,
			     0, IB_POLL_SOFTIRQ);
	if (IS_ERR(sendcq)) {
		rc = PTR_ERR(sendcq);
		dprintk("RPC: %s: failed to create send CQ: %i\n",
			__func__, rc);
		goto out1;
	}

	recvcq = ib_alloc_cq(ia->ri_device, NULL,
			     ep->rep_attr.cap.max_recv_wr + 1,
			     0, IB_POLL_SOFTIRQ);
	if (IS_ERR(recvcq)) {
		rc = PTR_ERR(recvcq);
		dprintk("RPC: %s: failed to create recv CQ: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = sendcq;
	ep->rep_attr.recv_cq = recvcq;

	/* Initialize cma parameters */
	memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));

	/* Prepare RDMA-CM private message */
	pmsg->cp_magic = rpcrdma_cmp_magic;
	pmsg->cp_version = RPCRDMA_CMP_VERSION;
	pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
	pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
	pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
	ep->rep_remote_cma.private_data = pmsg;
	ep->rep_remote_cma.private_data_len = sizeof(*pmsg);

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_device->attrs.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources =
			ia->ri_device->attrs.max_qp_rd_atom;

	/* Limit transport retries so client can detect server
	 * GID changes quickly. RPC layer handles re-establishing
	 * transport connection and retransmission.
	 */
	ep->rep_remote_cma.retry_count = 6;

	/* RPC-over-RDMA handles its own flow control. In addition,
	 * make all RNR NAKs visible so we know that RPC-over-RDMA
	 * flow control is working correctly (no NAKs should be seen).
	 */
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	ib_free_cq(sendcq);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	cancel_delayed_work_sync(&ep->rep_connect_worker);

	if (ia->ri_id->qp) {
		rpcrdma_ep_disconnect(ep, ia);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	ib_free_cq(ep->rep_attr.recv_cq);
	ib_free_cq(ep->rep_attr.send_cq);
}

/* Re-establish a connection after a device removal event.
 * Unlike a normal reconnection, a fresh PD and a new set
 * of MRs and buffers is needed.
 */
static int
rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
			 struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
	int rc, err;

	pr_info("%s: r_xprt = %p\n", __func__, r_xprt);

	rc = -EHOSTUNREACH;
	if (rpcrdma_ia_open(r_xprt, sap))
		goto out1;

	rc = -ENOMEM;
	err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data);
	if (err) {
		pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
		goto out2;
	}

	rc = -ENETUNREACH;
	err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (err) {
		pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
		goto out3;
	}

	rpcrdma_create_mrs(r_xprt);
	return 0;

out3:
	rpcrdma_ep_destroy(ep, ia);
out2:
	rpcrdma_ia_close(ia);
out1:
	return rc;
}

static int
rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
		     struct rpcrdma_ia *ia)
{
	struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
	struct rdma_cm_id *id, *old;
	int err, rc;

	dprintk("RPC: %s: reconnecting...\n", __func__);

	rpcrdma_ep_disconnect(ep, ia);

	rc = -EHOSTUNREACH;
	id = rpcrdma_create_id(r_xprt, ia, sap);
	if (IS_ERR(id))
		goto out;

	/* As long as the new ID points to the same device as the
	 * old ID, we can reuse the transport's existing PD and all
	 * previously allocated MRs. Also, the same device means
	 * the transport's previous DMA mappings are still valid.
	 *
	 * This is a sanity check only. There should be no way these
	 * point to two different devices here.
	 */
	old = id;
	rc = -ENETUNREACH;
	if (ia->ri_device != id->device) {
		pr_err("rpcrdma: can't reconnect on different device!\n");
		goto out_destroy;
	}

	err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
	if (err) {
		dprintk("RPC: %s: rdma_create_qp returned %d\n",
			__func__, err);
		goto out_destroy;
	}

	/* Atomically replace the transport's ID and QP. */
	rc = 0;
	old = ia->ri_id;
	ia->ri_id = id;
	rdma_destroy_qp(old);

out_destroy:
	rdma_destroy_id(old);
out:
	return rc;
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
						   rx_ia);
	unsigned int extras;
	int rc;

retry:
	switch (ep->rep_connected) {
	case 0:
		dprintk("RPC: %s: connecting...\n", __func__);
		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			rc = -ENETUNREACH;
			goto out_noupdate;
		}
		break;
	case -ENODEV:
		rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
		if (rc)
			goto out_noupdate;
		break;
	default:
		rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
		if (rc)
			goto out;
	}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
			__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
	if (ep->rep_connected <= 0) {
		if (ep->rep_connected == -EAGAIN)
			goto retry;
		rc = ep->rep_connected;
		goto out;
	}

	dprintk("RPC: %s: connected\n", __func__);
	extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
	if (extras)
		rpcrdma_ep_post_extra_recv(r_xprt, extras);

out:
	if (rc)
		ep->rep_connected = rc;

out_noupdate:
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
					 ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}

	ib_drain_qp(ia->ri_id->qp);
}

static void
rpcrdma_mr_recovery_worker(struct work_struct *work)
{
	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
						  rb_recovery_worker.work);
	struct rpcrdma_mw *mw;

	spin_lock(&buf->rb_recovery_lock);
	while (!list_empty(&buf->rb_stale_mrs)) {
		mw = rpcrdma_pop_mw(&buf->rb_stale_mrs);
		spin_unlock(&buf->rb_recovery_lock);

		dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
		mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);

		spin_lock(&buf->rb_recovery_lock);
	}
	spin_unlock(&buf->rb_recovery_lock);
}

void
rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
{
	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;

	spin_lock(&buf->rb_recovery_lock);
	rpcrdma_push_mw(mw, &buf->rb_stale_mrs);
	spin_unlock(&buf->rb_recovery_lock);

	schedule_delayed_work(&buf->rb_recovery_worker, 0);
}

static void
rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	unsigned int count;
	LIST_HEAD(free);
	LIST_HEAD(all);

	for (count = 0; count < 32; count++) {
		struct rpcrdma_mw *mw;
		int rc;

		mw = kzalloc(sizeof(*mw), GFP_KERNEL);
		if (!mw)
			break;

		rc = ia->ri_ops->ro_init_mr(ia, mw);
		if (rc) {
			kfree(mw);
			break;
		}

		mw->mw_xprt = r_xprt;

		list_add(&mw->mw_list, &free);
		list_add(&mw->mw_all, &all);
	}

	spin_lock(&buf->rb_mwlock);
	list_splice(&free, &buf->rb_mws);
	list_splice(&all, &buf->rb_all);
	r_xprt->rx_stats.mrs_allocated += count;
	spin_unlock(&buf->rb_mwlock);

	dprintk("RPC: %s: created %u MRs\n", __func__, count);
}

static void
rpcrdma_mr_refresh_worker(struct work_struct *work)
{
	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
						  rb_refresh_worker.work);
	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
						   rx_buf);

	rpcrdma_create_mrs(r_xprt);
}

struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
	struct rpcrdma_req *req;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (req == NULL)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&req->rl_free);
	spin_lock(&buffer->rb_reqslock);
	list_add(&req->rl_all, &buffer->rb_allreqs);
	spin_unlock(&buffer->rb_reqslock);
	req->rl_cqe.done = rpcrdma_wc_send;
	req->rl_buffer = &r_xprt->rx_buf;
	INIT_LIST_HEAD(&req->rl_registered);
	req->rl_send_wr.next = NULL;
	req->rl_send_wr.wr_cqe = &req->rl_cqe;
	req->rl_send_wr.sg_list = req->rl_send_sge;
	req->rl_send_wr.opcode = IB_WR_SEND;
	return req;
}

struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	struct rpcrdma_rep *rep;
	int rc;

	rc = -ENOMEM;
	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
	if (rep == NULL)
		goto out;

	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
					       DMA_FROM_DEVICE, GFP_KERNEL);
	if (IS_ERR(rep->rr_rdmabuf)) {
		rc = PTR_ERR(rep->rr_rdmabuf);
		goto out_free;
	}

	rep->rr_cqe.done = rpcrdma_wc_receive;
	rep->rr_rxprt = r_xprt;
	INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
	rep->rr_recv_wr.next = NULL;
	rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
	rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
	rep->rr_recv_wr.num_sge = 1;
	return rep;

out_free:
	kfree(rep);
out:
	return ERR_PTR(rc);
}

int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	int i, rc;

	buf->rb_max_requests = r_xprt->rx_data.max_requests;
	buf->rb_bc_srv_max_requests = 0;
	atomic_set(&buf->rb_credits, 1);
	spin_lock_init(&buf->rb_mwlock);
	spin_lock_init(&buf->rb_lock);
	spin_lock_init(&buf->rb_recovery_lock);
	INIT_LIST_HEAD(&buf->rb_mws);
	INIT_LIST_HEAD(&buf->rb_all);
	INIT_LIST_HEAD(&buf->rb_stale_mrs);
	INIT_DELAYED_WORK(&buf->rb_refresh_worker,
			  rpcrdma_mr_refresh_worker);
	INIT_DELAYED_WORK(&buf->rb_recovery_worker,
			  rpcrdma_mr_recovery_worker);

	rpcrdma_create_mrs(r_xprt);

	INIT_LIST_HEAD(&buf->rb_send_bufs);
	INIT_LIST_HEAD(&buf->rb_allreqs);
	spin_lock_init(&buf->rb_reqslock);
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;

		req = rpcrdma_create_req(r_xprt);
		if (IS_ERR(req)) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = PTR_ERR(req);
			goto out;
		}
		req->rl_backchannel = false;
		list_add(&req->rl_free, &buf->rb_send_bufs);
	}

	INIT_LIST_HEAD(&buf->rb_recv_bufs);
	for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) {
		struct rpcrdma_rep *rep;

		rep = rpcrdma_create_rep(r_xprt);
		if (IS_ERR(rep)) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = PTR_ERR(rep);
			goto out;
		}
		list_add(&rep->rr_list, &buf->rb_recv_bufs);
	}

	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

static struct rpcrdma_req *
rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_req *req;

	req = list_first_entry(&buf->rb_send_bufs,
			       struct rpcrdma_req, rl_free);
	list_del(&req->rl_free);
	return req;
}

static struct rpcrdma_rep *
rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_rep *rep;

	rep = list_first_entry(&buf->rb_recv_bufs,
			       struct rpcrdma_rep, rr_list);
	list_del(&rep->rr_list);
	return rep;
}

static void
rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
{
	rpcrdma_free_regbuf(rep->rr_rdmabuf);
	kfree(rep);
}

void
rpcrdma_destroy_req(struct rpcrdma_req *req)
{
	rpcrdma_free_regbuf(req->rl_recvbuf);
	rpcrdma_free_regbuf(req->rl_sendbuf);
	rpcrdma_free_regbuf(req->rl_rdmabuf);
	kfree(req);
}

static void
rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
						   rx_buf);
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *mw;
	unsigned int count;

	count = 0;
	spin_lock(&buf->rb_mwlock);
	while (!list_empty(&buf->rb_all)) {
		mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
		list_del(&mw->mw_all);

		spin_unlock(&buf->rb_mwlock);
		ia->ri_ops->ro_release_mr(mw);
		count++;
		spin_lock(&buf->rb_mwlock);
	}
	spin_unlock(&buf->rb_mwlock);
	r_xprt->rx_stats.mrs_allocated = 0;

	dprintk("RPC: %s: released %u MRs\n", __func__, count);
}

void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	cancel_delayed_work_sync(&buf->rb_recovery_worker);
	cancel_delayed_work_sync(&buf->rb_refresh_worker);

	while (!list_empty(&buf->rb_recv_bufs)) {
		struct rpcrdma_rep *rep;

		rep = rpcrdma_buffer_get_rep_locked(buf);
		rpcrdma_destroy_rep(rep);
	}
	buf->rb_send_count = 0;

	spin_lock(&buf->rb_reqslock);
	while (!list_empty(&buf->rb_allreqs)) {
		struct rpcrdma_req *req;

		req = list_first_entry(&buf->rb_allreqs,
				       struct rpcrdma_req, rl_all);
		list_del(&req->rl_all);

		spin_unlock(&buf->rb_reqslock);
		rpcrdma_destroy_req(req);
		spin_lock(&buf->rb_reqslock);
	}
	spin_unlock(&buf->rb_reqslock);
	buf->rb_recv_count = 0;

	rpcrdma_destroy_mrs(buf);
}

struct rpcrdma_mw *
rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_mw *mw = NULL;

	spin_lock(&buf->rb_mwlock);
	if (!list_empty(&buf->rb_mws))
		mw = rpcrdma_pop_mw(&buf->rb_mws);
	spin_unlock(&buf->rb_mwlock);

	if (!mw)
		goto out_nomws;
	mw->mw_flags = 0;
	return mw;

out_nomws:
	dprintk("RPC: %s: no MWs available\n", __func__);
	if (r_xprt->rx_ep.rep_connected != -ENODEV)
		schedule_delayed_work(&buf->rb_refresh_worker, 0);

	/* Allow the reply handler and refresh worker to run */
	cond_resched();

	return NULL;
}

void
rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;

	spin_lock(&buf->rb_mwlock);
	rpcrdma_push_mw(mw, &buf->rb_mws);
	spin_unlock(&buf->rb_mwlock);
}

static struct rpcrdma_rep *
rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers)
{
	/* If an RPC previously completed without a reply (say, a
	 * credential problem or a soft timeout occurs) then hold off
	 * on supplying more Receive buffers until the number of new
	 * pending RPCs catches up to the number of posted Receives.
	 */
	if (unlikely(buffers->rb_send_count < buffers->rb_recv_count))
		return NULL;
	if (unlikely(list_empty(&buffers->rb_recv_bufs)))
		return NULL;
	buffers->rb_recv_count++;
	return rpcrdma_buffer_get_rep_locked(buffers);
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if available) is attached to send buffer upon return.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;

	spin_lock(&buffers->rb_lock);
	if (list_empty(&buffers->rb_send_bufs))
		goto out_reqbuf;
	buffers->rb_send_count++;
	req = rpcrdma_buffer_get_req_locked(buffers);
	req->rl_reply = rpcrdma_buffer_get_rep(buffers);
	spin_unlock(&buffers->rb_lock);
	return req;

out_reqbuf:
	spin_unlock(&buffers->rb_lock);
	pr_warn("RPC: %s: out of request buffers\n", __func__);
	return NULL;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_rep *rep = req->rl_reply;

	req->rl_send_wr.num_sge = 0;
	req->rl_reply = NULL;

	spin_lock(&buffers->rb_lock);
	buffers->rb_send_count--;
	list_add_tail(&req->rl_free, &buffers->rb_send_bufs);
	if (rep) {
		buffers->rb_recv_count--;
		list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
	}
	spin_unlock(&buffers->rb_lock);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from disconnect.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;

	spin_lock(&buffers->rb_lock);
	req->rl_reply = rpcrdma_buffer_get_rep(buffers);
	spin_unlock(&buffers->rb_lock);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;

	spin_lock(&buffers->rb_lock);
	buffers->rb_recv_count--;
	list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
	spin_unlock(&buffers->rb_lock);
}

/**
 * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
 * @size: size of buffer to be allocated, in bytes
 * @direction: direction of data movement
 * @flags: GFP flags
 *
 * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
 * can be persistently DMA-mapped for I/O.
 *
 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
 * receiving the payload of RDMA RECV operations. During Long Calls
 * or Replies they may be registered externally via ro_map.
 */
struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
		     gfp_t flags)
{
	struct rpcrdma_regbuf *rb;

	rb = kmalloc(sizeof(*rb) + size, flags);
	if (rb == NULL)
		return ERR_PTR(-ENOMEM);

	rb->rg_device = NULL;
	rb->rg_direction = direction;
	rb->rg_iov.length = size;

	return rb;
}
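
/* Usage sketch for the regbuf helpers in this file, assuming "ia", "size",
 * and the DMA direction come from the caller (as they do elsewhere in
 * xprtrdma). The DMA mapping is deferred until the buffer is first used:
 *
 *	rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
 *	if (IS_ERR(rb))
 *		return PTR_ERR(rb);
 *	...
 *	if (!rpcrdma_dma_map_regbuf(ia, rb))
 *		return -EIO;
 *	...
 *	rpcrdma_free_regbuf(rb);
 */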

/**
 * __rpcrdma_dma_map_regbuf - DMA-map a regbuf
 * @ia: controlling rpcrdma_ia
 * @rb: regbuf to be mapped
 */
bool
__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
	struct ib_device *device = ia->ri_device;

	if (rb->rg_direction == DMA_NONE)
		return false;

	rb->rg_iov.addr = ib_dma_map_single(device,
					    (void *)rb->rg_base,
					    rdmab_length(rb),
					    rb->rg_direction);
	if (ib_dma_mapping_error(device, rdmab_addr(rb)))
		return false;

	rb->rg_device = device;
	rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
	return true;
}

static void
rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
{
	if (!rpcrdma_regbuf_is_mapped(rb))
		return;

	ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
			    rdmab_length(rb), rb->rg_direction);
	rb->rg_device = NULL;
}

/**
 * rpcrdma_free_regbuf - deregister and free registered buffer
 * @rb: regbuf to be deregistered and freed
 */
void
rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
{
	if (!rb)
		return;

	rpcrdma_dma_unmap_regbuf(rb);
	kfree(rb);
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr *send_wr = &req->rl_send_wr;
	struct ib_send_wr *send_wr_fail;
	int rc;

	if (req->rl_reply) {
		rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
		if (rc)
			return rc;
		req->rl_reply = NULL;
	}

	dprintk("RPC: %s: posting %d s/g entries\n",
		__func__, send_wr->num_sge);

	rpcrdma_set_signaled(ep, send_wr);
	rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
	if (rc)
		goto out_postsend_err;
	return 0;

out_postsend_err:
	pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
	return -ENOTCONN;
}

int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr *recv_wr_fail;
	int rc;

	if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
		goto out_map;
	rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
	if (rc)
		goto out_postrecv;
	return 0;

out_map:
	pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
	return -EIO;

out_postrecv:
	pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
	return -ENOTCONN;
}

/**
 * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests
 * @r_xprt: transport associated with these backchannel resources
 * @count: minimum number of incoming requests expected
 *
 * Returns zero if all requested buffers were posted, or a negative errno.
 */
int
rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
{
	struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_rep *rep;
	int rc;

	while (count--) {
		spin_lock(&buffers->rb_lock);
		if (list_empty(&buffers->rb_recv_bufs))
			goto out_reqbuf;
		rep = rpcrdma_buffer_get_rep_locked(buffers);
		spin_unlock(&buffers->rb_lock);

		rc = rpcrdma_ep_post_recv(ia, rep);
		if (rc)
			goto out_rc;
	}

	return 0;

out_reqbuf:
	spin_unlock(&buffers->rb_lock);
	pr_warn("%s: no extra receive buffers\n", __func__);
	return -ENOMEM;

out_rc:
	rpcrdma_recv_buffer_put(rep);
	return rc;
}