/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials provided
 * with the distribution.
 *
 * Neither the name of the Network Appliance, Inc. nor the names of
 * its contributors may be used to endorse or promote products
 * derived from this software without specific prior written
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>
#include <linux/module.h> /* try_module_get()/module_put() */

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

static struct workqueue_struct *rpcrdma_receive_wq;

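/* Allocate the global workqueue on which Receive completions are
 * processed. Returns 0 on success, or -ENOMEM if the workqueue
 * cannot be created.
 */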
int
rpcrdma_alloc_wq(void)
{
	struct workqueue_struct *recv_wq;

	recv_wq = alloc_workqueue("xprtrdma_receive",
				  WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI,
				  0);
	if (!recv_wq)
		return -ENOMEM;

	rpcrdma_receive_wq = recv_wq;
	return 0;
}

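/* Tear down the Receive workqueue created by rpcrdma_alloc_wq().
 * The global pointer is cleared before the workqueue is destroyed.
 */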
void
rpcrdma_destroy_wq(void)
{
	struct workqueue_struct *wq;

	if (rpcrdma_receive_wq) {
		wq = rpcrdma_receive_wq;
		rpcrdma_receive_wq = NULL;
		destroy_workqueue(wq);
	}
}

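/* Handle an asynchronous QP event reported by the RDMA provider.
 * If the endpoint is currently connected, mark it failed and wake
 * up anyone waiting on the connect state.
 */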
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("RPC: %s: %s on device %s ep %p\n",
	       __func__, ib_event_msg(event->event),
	       event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

/**
 * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
 * @cq: completion queue (ignored)
 * @wc: completed WR
 *
 */
static void
rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
	/* WARNING: Only wr_cqe and status are reliable at this point */
	if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("rpcrdma: Send: %s (%u/0x%x)\n",
		       ib_wc_status_msg(wc->status),
		       wc->status, wc->vendor_err);
}

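/* Work item run on rpcrdma_receive_wq for each completed Receive;
 * hands the incoming reply to the RPC/RDMA reply handler.
 */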
static void
rpcrdma_receive_worker(struct work_struct *work)
{
	struct rpcrdma_rep *rep =
			container_of(work, struct rpcrdma_rep, rr_work);

	rpcrdma_reply_handler(rep);
}

/* Perform basic sanity checking to avoid using garbage
 * to update the credit grant value.
 */
static void
rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
{
	struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf);
	struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf;
	u32 credits;

	if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
		return;

	credits = be32_to_cpu(rmsgp->rm_credit);
	if (credits == 0)
		credits = 1;	/* don't deadlock */
	else if (credits > buffer->rb_max_requests)
		credits = buffer->rb_max_requests;

	atomic_set(&buffer->rb_credits, credits);
}

/**
 * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC
 * @cq: completion queue (ignored)
 * @wc: completed WR
 *
 */
static void
rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
					       rr_cqe);

	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->status != IB_WC_SUCCESS)
		goto out_fail;

	/* status == SUCCESS means all fields in wc are trustworthy */
	if (wc->opcode != IB_WC_RECV)
		return;

	dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
		__func__, rep, wc->byte_len);

	rep->rr_len = wc->byte_len;
	ib_dma_sync_single_for_cpu(rep->rr_device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rep->rr_len, DMA_FROM_DEVICE);

	rpcrdma_update_granted_credits(rep);

out_schedule:
	queue_work(rpcrdma_receive_wq, &rep->rr_work);
	return;

out_fail:
	if (wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
		       ib_wc_status_msg(wc->status),
		       wc->status, wc->vendor_err);
	rep->rr_len = RPCRDMA_BAD_LEN;
	goto out_schedule;
}

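/* Connection manager event handler: records the result of address and
 * route resolution, tracks connect/disconnect transitions in
 * ep->rep_connected, and wakes up anyone waiting on the connect state.
 */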
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
#endif
	struct ib_qp_attr *attr = &ia->ri_qp_attr;
	struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, attr,
			    IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			    iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr->max_dest_rd_atomic,
			attr->max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		atomic_set(&xprt->rx_buf.rb_credits, 1);
		ep->rep_connected = connstate;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		/*FALLTHROUGH*/
	default:
		dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
			__func__, sap, rpc_get_port(sap), ep,
			rdma_event_msg(event->event));
		break;
	}

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	if (connstate == 1) {
		int ird = attr->max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;

		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
			sap, rpc_get_port(sap),
			ia->ri_device->name,
			ia->ri_ops->ro_displayname,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
			sap, rpc_get_port(sap), connstate);
	}
#endif

	return 0;
}

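/* Release the reference on the device module taken in rpcrdma_create_id()
 * and destroy the connection manager ID.
 */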
static void rpcrdma_destroy_id(struct rdma_cm_id *id)
{
	if (id) {
		module_put(id->device->owner);
		rdma_destroy_id(id);
	}
}

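/* Create a connection manager ID and resolve the server's address and
 * route, waiting for each step to complete. The underlying device module
 * is pinned for the life of the ID. Returns the new ID, or an ERR_PTR.
 */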
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
		  struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
			    IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);

	/* FIXME:
	 * Until xprtrdma supports DEVICE_REMOVAL, the provider must
	 * be pinned while there are active NFS/RDMA mounts to prevent
	 * hangs and crashes at umount time.
	 */
	if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
		dprintk("RPC: %s: Failed to get device module\n",
			__func__);
		ia->ri_async_rc = -ENODEV;
	}
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto put;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto put;

	return id;

put:
	module_put(id->device->owner);
out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	int rc;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}
	ia->ri_device = ia->ri_id->device;

	ia->ri_pd = ib_alloc_pd(ia->ri_device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
		goto out2;
	}

	switch (memreg) {
	case RPCRDMA_FRMR:
		if (frwr_is_supported(ia)) {
			ia->ri_ops = &rpcrdma_frwr_memreg_ops;
			break;
		}
		/*FALLTHROUGH*/
	case RPCRDMA_MTHCAFMR:
		if (fmr_is_supported(ia)) {
			ia->ri_ops = &rpcrdma_fmr_memreg_ops;
			break;
		}
		/*FALLTHROUGH*/
	default:
		pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
		       memreg);
		rc = -EINVAL;
		goto out3;
	}

	return 0;

out3:
	ib_dealloc_pd(ia->ri_pd);
	ia->ri_pd = NULL;
out2:
	rpcrdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *  o if event handles and PD have been initialized, free them.
 *  o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rpcrdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}

	/* If the pd is still busy, xprtrdma missed freeing a resource */
	if (ia->ri_pd && !IS_ERR(ia->ri_pd))
		ib_dealloc_pd(ia->ri_pd);
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
		  struct rpcrdma_create_data_internal *cdata)
{
	struct ib_cq *sendcq, *recvcq;
	unsigned int max_qp_wr;
	int rc;

	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
		dprintk("RPC: %s: insufficient sge's available\n",
			__func__);
		return -ENOMEM;
	}

	if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
		dprintk("RPC: %s: insufficient wqe's available\n",
			__func__);
		return -ENOMEM;
	}
	max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > max_qp_wr)
		cdata->max_requests = max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_send_wr += 1;	/* drain cqe */
	rc = ia->ri_ops->ro_open(ia, ep, cdata);
	if (rc)
		return rc;
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_recv_wr += 1;	/* drain cqe */
	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;	/* always signal? */
	INIT_CQCOUNT(ep);
	init_waitqueue_head(&ep->rep_connect_wait);
	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);

	sendcq = ib_alloc_cq(ia->ri_device, NULL,
			     ep->rep_attr.cap.max_send_wr + 1,
			     0, IB_POLL_SOFTIRQ);
	if (IS_ERR(sendcq)) {
		rc = PTR_ERR(sendcq);
		dprintk("RPC: %s: failed to create send CQ: %i\n",
			__func__, rc);
		goto out1;
	}

	recvcq = ib_alloc_cq(ia->ri_device, NULL,
			     ep->rep_attr.cap.max_recv_wr + 1,
			     0, IB_POLL_SOFTIRQ);
	if (IS_ERR(recvcq)) {
		rc = PTR_ERR(recvcq);
		dprintk("RPC: %s: failed to create recv CQ: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = sendcq;
	ep->rep_attr.recv_cq = recvcq;

	/* Initialize cma parameters */
	memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_device->attrs.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources =
						ia->ri_device->attrs.max_qp_rd_atom;

	/* Limit transport retries so client can detect server
	 * GID changes quickly. RPC layer handles re-establishing
	 * transport connection and retransmission.
	 */
	ep->rep_remote_cma.retry_count = 6;

	/* RPC-over-RDMA handles its own flow control. In addition,
	 * make all RNR NAKs visible so we know that RPC-over-RDMA
	 * flow control is working correctly (no NAKs should be seen).
	 */
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	ib_free_cq(sendcq);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	cancel_delayed_work_sync(&ep->rep_connect_worker);

	if (ia->ri_id->qp) {
		rpcrdma_ep_disconnect(ep, ia);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	ib_free_cq(ep->rep_attr.recv_cq);
	ib_free_cq(ep->rep_attr.send_cq);
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id, *old;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		dprintk("RPC: %s: reconnecting...\n", __func__);

		rpcrdma_ep_disconnect(ep, ia);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = -EHOSTUNREACH;
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_device != id->device) {
			printk("RPC: %s: can't reconnect on "
			       "different device!\n", __func__);
			rpcrdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}
		/* END TEMP */
		rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			rpcrdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}

		old = ia->ri_id;
		ia->ri_id = id;

		rdma_destroy_qp(old);
		rpcrdma_destroy_id(old);
	} else {
		dprintk("RPC: %s: connecting...\n", __func__);
		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			/* do not update ep->rep_connected */
			return -ENETUNREACH;
		}
	}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
			__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
		     ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		struct rpcrdma_xprt *r_xprt;
		unsigned int extras;

		dprintk("RPC: %s: connected\n", __func__);

		r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		extras = r_xprt->rx_buf.rb_bc_srv_max_requests;

		if (extras) {
			rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
			if (rc) {
				pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
					__func__, rc);
				rc = 0;
			}
		}
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
					 ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}

	ib_drain_qp(ia->ri_id->qp);
}

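/* Work item that runs the memory-registration-specific ->ro_recover_mr
 * method on each MR that has been placed on the stale list.
 */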
static void
rpcrdma_mr_recovery_worker(struct work_struct *work)
{
	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
						  rb_recovery_worker.work);
	struct rpcrdma_mw *mw;

	spin_lock(&buf->rb_recovery_lock);
	while (!list_empty(&buf->rb_stale_mrs)) {
		mw = list_first_entry(&buf->rb_stale_mrs,
				      struct rpcrdma_mw, mw_list);
		list_del_init(&mw->mw_list);
		spin_unlock(&buf->rb_recovery_lock);

		dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
		mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);

		spin_lock(&buf->rb_recovery_lock);
	}
	spin_unlock(&buf->rb_recovery_lock);
}

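/* Add an MR to the stale list and schedule the recovery worker to
 * handle it.
 */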
void
rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
{
	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;

	spin_lock(&buf->rb_recovery_lock);
	list_add(&mw->mw_list, &buf->rb_stale_mrs);
	spin_unlock(&buf->rb_recovery_lock);

	schedule_delayed_work(&buf->rb_recovery_worker, 0);
}

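/* Allocate and initialize a batch of MRs (up to 32 at a time) and add
 * them to the transport's free list and "all" list.
 */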
static void
rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	unsigned int count;
	LIST_HEAD(free);
	LIST_HEAD(all);

	for (count = 0; count < 32; count++) {
		struct rpcrdma_mw *mw;
		int rc;

		mw = kzalloc(sizeof(*mw), GFP_KERNEL);
		if (!mw)
			break;

		rc = ia->ri_ops->ro_init_mr(ia, mw);
		if (rc) {
			kfree(mw);
			break;
		}

		mw->mw_xprt = r_xprt;

		list_add(&mw->mw_list, &free);
		list_add(&mw->mw_all, &all);
	}

	spin_lock(&buf->rb_mwlock);
	list_splice(&free, &buf->rb_mws);
	list_splice(&all, &buf->rb_all);
	r_xprt->rx_stats.mrs_allocated += count;
	spin_unlock(&buf->rb_mwlock);

	dprintk("RPC: %s: created %u MRs\n", __func__, count);
}

static void
rpcrdma_mr_refresh_worker(struct work_struct *work)
{
	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
						  rb_refresh_worker.work);
	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
						   rx_buf);

	rpcrdma_create_mrs(r_xprt);
}

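/* Allocate an rpcrdma_req, link it onto the transport's list of all
 * requests, and set up its Send completion handler. Returns the new
 * request or an ERR_PTR on allocation failure.
 */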
struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
	struct rpcrdma_req *req;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (req == NULL)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&req->rl_free);
	spin_lock(&buffer->rb_reqslock);
	list_add(&req->rl_all, &buffer->rb_allreqs);
	spin_unlock(&buffer->rb_reqslock);
	req->rl_cqe.done = rpcrdma_wc_send;
	req->rl_buffer = &r_xprt->rx_buf;
	INIT_LIST_HEAD(&req->rl_registered);
	return req;
}

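/* Allocate an rpcrdma_rep along with the DMA-mapped regbuf that will
 * receive inline replies. Returns the new rep or an ERR_PTR.
 */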
struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_rep *rep;
	int rc;

	rc = -ENOMEM;
	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
	if (rep == NULL)
		goto out;

	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
					       GFP_KERNEL);
	if (IS_ERR(rep->rr_rdmabuf)) {
		rc = PTR_ERR(rep->rr_rdmabuf);
		goto out_free;
	}

	rep->rr_device = ia->ri_device;
	rep->rr_cqe.done = rpcrdma_receive_wc;
	rep->rr_rxprt = r_xprt;
	INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
	return rep;

out_free:
	kfree(rep);
out:
	return ERR_PTR(rc);
}

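/* Create the transport's buffer pool: MR lists, request and reply
 * buffers, and the delayed work items that maintain them. Returns 0 on
 * success or a negative errno, in which case the partially built pool
 * is torn down.
 */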
int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	int i, rc;

	buf->rb_max_requests = r_xprt->rx_data.max_requests;
	buf->rb_bc_srv_max_requests = 0;
	atomic_set(&buf->rb_credits, 1);
	spin_lock_init(&buf->rb_mwlock);
	spin_lock_init(&buf->rb_lock);
	spin_lock_init(&buf->rb_recovery_lock);
	INIT_LIST_HEAD(&buf->rb_mws);
	INIT_LIST_HEAD(&buf->rb_all);
	INIT_LIST_HEAD(&buf->rb_stale_mrs);
	INIT_DELAYED_WORK(&buf->rb_refresh_worker,
			  rpcrdma_mr_refresh_worker);
	INIT_DELAYED_WORK(&buf->rb_recovery_worker,
			  rpcrdma_mr_recovery_worker);

	rpcrdma_create_mrs(r_xprt);

	INIT_LIST_HEAD(&buf->rb_send_bufs);
	INIT_LIST_HEAD(&buf->rb_allreqs);
	spin_lock_init(&buf->rb_reqslock);
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;

		req = rpcrdma_create_req(r_xprt);
		if (IS_ERR(req)) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = PTR_ERR(req);
			goto out;
		}
		req->rl_backchannel = false;
		list_add(&req->rl_free, &buf->rb_send_bufs);
	}

	INIT_LIST_HEAD(&buf->rb_recv_bufs);
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_rep *rep;

		rep = rpcrdma_create_rep(r_xprt);
		if (IS_ERR(rep)) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = PTR_ERR(rep);
			goto out;
		}
		list_add(&rep->rr_list, &buf->rb_recv_bufs);
	}

	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

static struct rpcrdma_req *
rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_req *req;

	req = list_first_entry(&buf->rb_send_bufs,
			       struct rpcrdma_req, rl_free);
	list_del(&req->rl_free);
	return req;
}

static struct rpcrdma_rep *
rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_rep *rep;

	rep = list_first_entry(&buf->rb_recv_bufs,
			       struct rpcrdma_rep, rr_list);
	list_del(&rep->rr_list);
	return rep;
}

static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
	rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
	kfree(rep);
}

void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
	rpcrdma_free_regbuf(ia, req->rl_sendbuf);
	rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
	kfree(req);
}

static void
rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
						   rx_buf);
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *mw;
	unsigned int count;

	count = 0;
	spin_lock(&buf->rb_mwlock);
	while (!list_empty(&buf->rb_all)) {
		mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
		list_del(&mw->mw_all);

		spin_unlock(&buf->rb_mwlock);
		ia->ri_ops->ro_release_mr(mw);
		count++;
		spin_lock(&buf->rb_mwlock);
	}
	spin_unlock(&buf->rb_mwlock);
	r_xprt->rx_stats.mrs_allocated = 0;

	dprintk("RPC: %s: released %u MRs\n", __func__, count);
}

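/* Release every reply buffer, request buffer, and MR owned by the
 * buffer pool. Called when the transport is being torn down, and by
 * rpcrdma_buffer_create() if it fails partway through.
 */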
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);

	cancel_delayed_work_sync(&buf->rb_recovery_worker);

	while (!list_empty(&buf->rb_recv_bufs)) {
		struct rpcrdma_rep *rep;

		rep = rpcrdma_buffer_get_rep_locked(buf);
		rpcrdma_destroy_rep(ia, rep);
	}

	spin_lock(&buf->rb_reqslock);
	while (!list_empty(&buf->rb_allreqs)) {
		struct rpcrdma_req *req;

		req = list_first_entry(&buf->rb_allreqs,
				       struct rpcrdma_req, rl_all);
		list_del(&req->rl_all);

		spin_unlock(&buf->rb_reqslock);
		rpcrdma_destroy_req(ia, req);
		spin_lock(&buf->rb_reqslock);
	}
	spin_unlock(&buf->rb_reqslock);

	rpcrdma_destroy_mrs(buf);
}

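/* Take an MR off the transport's free list. If none are available,
 * schedule the refresh worker to create more and return NULL.
 */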
struct rpcrdma_mw *
rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_mw *mw = NULL;

	spin_lock(&buf->rb_mwlock);
	if (!list_empty(&buf->rb_mws)) {
		mw = list_first_entry(&buf->rb_mws,
				      struct rpcrdma_mw, mw_list);
		list_del_init(&mw->mw_list);
	}
	spin_unlock(&buf->rb_mwlock);

	if (!mw)
		goto out_nomws;
	return mw;

out_nomws:
	dprintk("RPC: %s: no MWs available\n", __func__);
	schedule_delayed_work(&buf->rb_refresh_worker, 0);

	/* Allow the reply handler and refresh worker to run */
	cond_resched();

	return NULL;
}

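/* Return an MR to the transport's free list. */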
void
rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;

	spin_lock(&buf->rb_mwlock);
	list_add_tail(&mw->mw_list, &buf->rb_mws);
	spin_unlock(&buf->rb_mwlock);
}

/*
 * Get a set of request/reply buffers.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;

	spin_lock(&buffers->rb_lock);
	if (list_empty(&buffers->rb_send_bufs))
		goto out_reqbuf;
	req = rpcrdma_buffer_get_req_locked(buffers);
	if (list_empty(&buffers->rb_recv_bufs))
		goto out_repbuf;
	req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
	spin_unlock(&buffers->rb_lock);
	return req;

out_reqbuf:
	spin_unlock(&buffers->rb_lock);
	pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
	return NULL;
out_repbuf:
	list_add(&req->rl_free, &buffers->rb_send_bufs);
	spin_unlock(&buffers->rb_lock);
	pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
	return NULL;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_rep *rep = req->rl_reply;

	req->rl_niovs = 0;
	req->rl_reply = NULL;

	spin_lock(&buffers->rb_lock);
	list_add_tail(&req->rl_free, &buffers->rb_send_bufs);
	if (rep)
		list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
	spin_unlock(&buffers->rb_lock);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from disconnect.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;

	spin_lock(&buffers->rb_lock);
	if (!list_empty(&buffers->rb_recv_bufs))
		req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
	spin_unlock(&buffers->rb_lock);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;

	spin_lock(&buffers->rb_lock);
	list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
	spin_unlock(&buffers->rb_lock);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

/**
 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
 * @ia: controlling rpcrdma_ia
 * @size: size of buffer to be allocated, in bytes
 * @flags: GFP flags
 *
 * Returns pointer to private header of an area of internally
 * registered memory, or an ERR_PTR. The registered buffer follows
 * the end of the private header.
 *
 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
 * receiving the payload of RDMA RECV operations. regbufs are not
 * used for RDMA READ/WRITE operations, thus are registered only for
 * LOCAL access.
 */
struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
	struct rpcrdma_regbuf *rb;
	struct ib_sge *iov;

	rb = kmalloc(sizeof(*rb) + size, flags);
	if (rb == NULL)
		goto out;

	iov = &rb->rg_iov;
	iov->addr = ib_dma_map_single(ia->ri_device,
				      (void *)rb->rg_base, size,
				      DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
		goto out_free;

	iov->length = size;
	iov->lkey = ia->ri_pd->local_dma_lkey;
	rb->rg_size = size;
	rb->rg_owner = NULL;
	return rb;

out_free:
	kfree(rb);
out:
	return ERR_PTR(-ENOMEM);
}

/**
 * rpcrdma_free_regbuf - deregister and free registered buffer
 * @ia: controlling rpcrdma_ia
 * @rb: regbuf to be deregistered and freed
 */
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
	struct ib_sge *iov;

	if (!rb)
		return;

	iov = &rb->rg_iov;
	ib_dma_unmap_single(ia->ri_device,
			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
	kfree(rb);
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_device *device = ia->ri_device;
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	struct ib_sge *iov = req->rl_send_iov;
	int i, rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			return rc;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_cqe = &req->rl_cqe;
	send_wr.sg_list = iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;

	for (i = 0; i < send_wr.num_sge; i++)
		ib_dma_sync_single_for_device(device, iov[i].addr,
					      iov[i].length, DMA_TO_DEVICE);
	dprintk("RPC: %s: posting %d s/g entries\n",
		__func__, send_wr.num_sge);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		goto out_postsend_err;
	return 0;

out_postsend_err:
	pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
	return -ENOTCONN;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_cqe = &rep->rr_cqe;
	recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rdmab_length(rep->rr_rdmabuf),
				   DMA_BIDIRECTIONAL);

	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
	if (rc)
		goto out_postrecv;
	return 0;

out_postrecv:
	pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
	return -ENOTCONN;
}

/**
 * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests
 * @r_xprt: transport associated with these backchannel resources
 * @count: minimum number of incoming requests expected
 *
 * Returns zero if all requested buffers were posted, or a negative errno.
 */
int
rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
{
	struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
	struct rpcrdma_rep *rep;
	int rc;

	while (count--) {
		spin_lock(&buffers->rb_lock);
		if (list_empty(&buffers->rb_recv_bufs))
			goto out_reqbuf;
		rep = rpcrdma_buffer_get_rep_locked(buffers);
		spin_unlock(&buffers->rb_lock);

		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out_rc;
	}

	return 0;

out_reqbuf:
	spin_unlock(&buffers->rb_lock);
	pr_warn("%s: no extra receive buffers\n", __func__);
	return -ENOMEM;

out_rc:
	rpcrdma_recv_buffer_put(rep);
	return rc;
}