/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>
#include <linux/module.h>	/* try_module_get()/module_put() */

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
        struct rpcrdma_rep *rep;
        unsigned long flags;

        data = data;
        spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
        while (!list_empty(&rpcrdma_tasklets_g)) {
                rep = list_entry(rpcrdma_tasklets_g.next,
                                 struct rpcrdma_rep, rr_list);
                list_del(&rep->rr_list);
                spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

                rpcrdma_reply_handler(rep);

                spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
        }
        spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

static void
rpcrdma_schedule_tasklet(struct list_head *sched_list)
{
        unsigned long flags;

        spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
        list_splice_tail(sched_list, &rpcrdma_tasklets_g);
        spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
        tasklet_schedule(&rpcrdma_tasklet_g);
}
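
/* Asynchronous event upcalls: a fatal QP or CQ event marks a connected
 * endpoint as failed and wakes any thread waiting on a connection
 * state change.
 */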
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
        struct rpcrdma_ep *ep = context;

        pr_err("RPC: %s: %s on device %s ep %p\n",
               __func__, ib_event_msg(event->event),
               event->device->name, context);
        if (ep->rep_connected == 1) {
                ep->rep_connected = -EIO;
                rpcrdma_conn_func(ep);
                wake_up_all(&ep->rep_connect_wait);
        }
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
        struct rpcrdma_ep *ep = context;

        pr_err("RPC: %s: %s on device %s ep %p\n",
               __func__, ib_event_msg(event->event),
               event->device->name, context);
        if (ep->rep_connected == 1) {
                ep->rep_connected = -EIO;
                rpcrdma_conn_func(ep);
                wake_up_all(&ep->rep_connect_wait);
        }
}

static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
        /* WARNING: Only wr_id and status are reliable at this point */
        if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
                if (wc->status != IB_WC_SUCCESS &&
                    wc->status != IB_WC_WR_FLUSH_ERR)
                        pr_err("RPC: %s: SEND: %s\n",
                               __func__, ib_wc_status_msg(wc->status));
        } else {
                struct rpcrdma_mw *r;

                r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
                r->mw_sendcompletion(wc);
        }
}
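
/* Poll the send CQ in batches of RPCRDMA_POLLSIZE work completions.
 * The budget bounds how many batches one upcall will process before
 * giving the CPU back.
 */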
static int
rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
        struct ib_wc *wcs;
        int budget, count, rc;

        budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
        do {
                wcs = ep->rep_send_wcs;

                rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
                if (rc <= 0)
                        return rc;

                count = rc;
                while (count-- > 0)
                        rpcrdma_sendcq_process_wc(wcs++);
        } while (rc == RPCRDMA_POLLSIZE && --budget);
        return 0;
}

/*
 * Handle send, fast_reg_mr, and local_inv completions.
 *
 * Send events are typically suppressed and thus do not result
 * in an upcall. Occasionally one is signaled, however. This
 * prevents the provider's completion queue from wrapping and
 * losing a completion.
 */
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
        struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
        int rc;

        rc = rpcrdma_sendcq_poll(cq, ep);
        if (rc) {
                dprintk("RPC: %s: ib_poll_cq failed: %i\n",
                        __func__, rc);
                return;
        }

        rc = ib_req_notify_cq(cq,
                        IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
        if (rc == 0)
                return;
        if (rc < 0) {
                dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                return;
        }

        rpcrdma_sendcq_poll(cq, ep);
}

static void
rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
{
        struct rpcrdma_rep *rep =
                        (struct rpcrdma_rep *)(unsigned long)wc->wr_id;

        /* WARNING: Only wr_id and status are reliable at this point */
        if (wc->status != IB_WC_SUCCESS)
                goto out_fail;

        /* status == SUCCESS means all fields in wc are trustworthy */
        if (wc->opcode != IB_WC_RECV)
                return;

        dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
                __func__, rep, wc->byte_len);

        rep->rr_len = wc->byte_len;
        ib_dma_sync_single_for_cpu(rep->rr_device,
                                   rdmab_addr(rep->rr_rdmabuf),
                                   rep->rr_len, DMA_FROM_DEVICE);
        prefetch(rdmab_to_msg(rep->rr_rdmabuf));

out_schedule:
        list_add_tail(&rep->rr_list, sched_list);
        return;

out_fail:
        if (wc->status != IB_WC_WR_FLUSH_ERR)
                pr_err("RPC: %s: rep %p: %s\n",
                       __func__, rep, ib_wc_status_msg(wc->status));
        rep->rr_len = ~0U;
        goto out_schedule;
}

static int
rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
        struct list_head sched_list;
        struct ib_wc *wcs;
        int budget, count, rc;

        INIT_LIST_HEAD(&sched_list);
        budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
        do {
                wcs = ep->rep_recv_wcs;

                rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
                if (rc <= 0)
                        goto out_schedule;

                count = rc;
                while (count-- > 0)
                        rpcrdma_recvcq_process_wc(wcs++, &sched_list);
        } while (rc == RPCRDMA_POLLSIZE && --budget);
        rc = 0;

out_schedule:
        rpcrdma_schedule_tasklet(&sched_list);
        return rc;
}

/*
 * Handle receive completions.
 *
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 */
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
        struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
        int rc;

        rc = rpcrdma_recvcq_poll(cq, ep);
        if (rc) {
                dprintk("RPC: %s: ib_poll_cq failed: %i\n",
                        __func__, rc);
                return;
        }

        rc = ib_req_notify_cq(cq,
                        IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
        if (rc == 0)
                return;
        if (rc < 0) {
                dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                return;
        }

        rpcrdma_recvcq_poll(cq, ep);
}
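
/* Drain both completion queues, handing any completed receives to the
 * reply tasklet, so that no completions are left behind before a
 * disconnect or reconnect.
 */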
static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
        struct ib_wc wc;
        LIST_HEAD(sched_list);

        while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
                rpcrdma_recvcq_process_wc(&wc, &sched_list);
        if (!list_empty(&sched_list))
                rpcrdma_schedule_tasklet(&sched_list);
        while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
                rpcrdma_sendcq_process_wc(&wc);
}
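
/* Connection manager event handler: records the outcome of address and
 * route resolution for rpcrdma_create_id(), and tracks connection state
 * changes so that waiters on rep_connect_wait are woken.
 */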
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
        struct rpcrdma_xprt *xprt = id->context;
        struct rpcrdma_ia *ia = &xprt->rx_ia;
        struct rpcrdma_ep *ep = &xprt->rx_ep;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
        struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
#endif
        struct ib_qp_attr *attr = &ia->ri_qp_attr;
        struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
        int connstate = 0;

        switch (event->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
        case RDMA_CM_EVENT_ROUTE_RESOLVED:
                ia->ri_async_rc = 0;
                complete(&ia->ri_done);
                break;
        case RDMA_CM_EVENT_ADDR_ERROR:
                ia->ri_async_rc = -EHOSTUNREACH;
                dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
                        __func__, ep);
                complete(&ia->ri_done);
                break;
        case RDMA_CM_EVENT_ROUTE_ERROR:
                ia->ri_async_rc = -ENETUNREACH;
                dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
                        __func__, ep);
                complete(&ia->ri_done);
                break;
        case RDMA_CM_EVENT_ESTABLISHED:
                connstate = 1;
                ib_query_qp(ia->ri_id->qp, attr,
                            IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
                            iattr);
                dprintk("RPC: %s: %d responder resources"
                        " (%d initiator)\n",
                        __func__, attr->max_dest_rd_atomic,
                        attr->max_rd_atomic);
                goto connected;
        case RDMA_CM_EVENT_CONNECT_ERROR:
                connstate = -ENOTCONN;
                goto connected;
        case RDMA_CM_EVENT_UNREACHABLE:
                connstate = -ENETDOWN;
                goto connected;
        case RDMA_CM_EVENT_REJECTED:
                connstate = -ECONNREFUSED;
                goto connected;
        case RDMA_CM_EVENT_DISCONNECTED:
                connstate = -ECONNABORTED;
                goto connected;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
                connstate = -ENODEV;
connected:
                dprintk("RPC: %s: %sconnected\n",
                        __func__, connstate > 0 ? "" : "dis");
                ep->rep_connected = connstate;
                rpcrdma_conn_func(ep);
                wake_up_all(&ep->rep_connect_wait);
                /*FALLTHROUGH*/
        default:
                dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
                        __func__, sap, rpc_get_port(sap), ep,
                        rdma_event_msg(event->event));
                break;
        }

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
        if (connstate == 1) {
                int ird = attr->max_dest_rd_atomic;
                int tird = ep->rep_remote_cma.responder_resources;

                pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
                        sap, rpc_get_port(sap),
                        ia->ri_device->name,
                        ia->ri_ops->ro_displayname,
                        xprt->rx_buf.rb_max_requests,
                        ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
        } else if (connstate < 0) {
                pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
                        sap, rpc_get_port(sap), connstate);
        }
#endif

        return 0;
}
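
/* rpcrdma_destroy_id() pairs with rpcrdma_create_id(): it drops the
 * device module reference taken at create time, then destroys the
 * rdma_cm_id.
 */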
static void rpcrdma_destroy_id(struct rdma_cm_id *id)
{
        if (id) {
                module_put(id->device->owner);
                rdma_destroy_id(id);
        }
}

static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
                        struct rpcrdma_ia *ia, struct sockaddr *addr)
{
        struct rdma_cm_id *id;
        int rc;

        init_completion(&ia->ri_done);

        id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(id)) {
                rc = PTR_ERR(id);
                dprintk("RPC: %s: rdma_create_id() failed %i\n",
                        __func__, rc);
                return id;
        }

        ia->ri_async_rc = -ETIMEDOUT;
        rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
        if (rc) {
                dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
                        __func__, rc);
                goto out;
        }
        wait_for_completion_interruptible_timeout(&ia->ri_done,
                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);

        /* FIXME:
         * Until xprtrdma supports DEVICE_REMOVAL, the provider must
         * be pinned while there are active NFS/RDMA mounts to prevent
         * hangs and crashes at umount time.
         */
        if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
                dprintk("RPC: %s: Failed to get device module\n",
                        __func__);
                ia->ri_async_rc = -ENODEV;
        }
        rc = ia->ri_async_rc;
        if (rc)
                goto out;

        ia->ri_async_rc = -ETIMEDOUT;
        rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
        if (rc) {
                dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
                        __func__, rc);
                goto put;
        }
        wait_for_completion_interruptible_timeout(&ia->ri_done,
                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
        rc = ia->ri_async_rc;
        if (rc)
                goto put;

        return id;

put:
        module_put(id->device->owner);
out:
        rdma_destroy_id(id);
        return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
        struct ib_wc wc;
        int count = 0;

        while (1 == ib_poll_cq(cq, 1, &wc))
                ++count;

        if (count)
                dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
                        __func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
        struct rpcrdma_ia *ia = &xprt->rx_ia;
        struct ib_device_attr *devattr = &ia->ri_devattr;
        int rc;

        ia->ri_dma_mr = NULL;

        ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
        if (IS_ERR(ia->ri_id)) {
                rc = PTR_ERR(ia->ri_id);
                goto out1;
        }
        ia->ri_device = ia->ri_id->device;

        ia->ri_pd = ib_alloc_pd(ia->ri_device);
        if (IS_ERR(ia->ri_pd)) {
                rc = PTR_ERR(ia->ri_pd);
                dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
                        __func__, rc);
                goto out2;
        }

        rc = ib_query_device(ia->ri_device, devattr);
        if (rc) {
                dprintk("RPC: %s: ib_query_device failed %d\n",
                        __func__, rc);
                goto out3;
        }

        if (memreg == RPCRDMA_FRMR) {
                if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
                    (devattr->max_fast_reg_page_list_len == 0)) {
                        dprintk("RPC: %s: FRMR registration "
                                "not supported by HCA\n", __func__);
                        memreg = RPCRDMA_MTHCAFMR;
                }
        }
        if (memreg == RPCRDMA_MTHCAFMR) {
                if (!ia->ri_device->alloc_fmr) {
                        dprintk("RPC: %s: MTHCAFMR registration "
                                "not supported by HCA\n", __func__);
                        rc = -EINVAL;
                        goto out3;
                }
        }

        switch (memreg) {
        case RPCRDMA_FRMR:
                ia->ri_ops = &rpcrdma_frwr_memreg_ops;
                break;
        case RPCRDMA_ALLPHYSICAL:
                ia->ri_ops = &rpcrdma_physical_memreg_ops;
                break;
        case RPCRDMA_MTHCAFMR:
                ia->ri_ops = &rpcrdma_fmr_memreg_ops;
                break;
        default:
                printk(KERN_ERR "RPC: Unsupported memory "
                                "registration mode: %d\n", memreg);
                rc = -ENOMEM;
                goto out3;
        }
        dprintk("RPC: %s: memory registration strategy is '%s'\n",
                __func__, ia->ri_ops->ro_displayname);

        rwlock_init(&ia->ri_qplock);
        return 0;

out3:
        ib_dealloc_pd(ia->ri_pd);
        ia->ri_pd = NULL;
out2:
        rpcrdma_destroy_id(ia->ri_id);
        ia->ri_id = NULL;
out1:
        return rc;
}

/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
        dprintk("RPC: %s: entering\n", __func__);
        if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
                if (ia->ri_id->qp)
                        rdma_destroy_qp(ia->ri_id);
                rpcrdma_destroy_id(ia->ri_id);
                ia->ri_id = NULL;
        }

        /* If the pd is still busy, xprtrdma missed freeing a resource */
        if (ia->ri_pd && !IS_ERR(ia->ri_pd))
                ib_dealloc_pd(ia->ri_pd);
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                  struct rpcrdma_create_data_internal *cdata)
{
        struct ib_device_attr *devattr = &ia->ri_devattr;
        struct ib_cq *sendcq, *recvcq;
        struct ib_cq_init_attr cq_attr = {};
        int rc, err;

        if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
                dprintk("RPC: %s: insufficient sge's available\n",
                        __func__);
                return -ENOMEM;
        }

        /* check provider's send/recv wr limits */
        if (cdata->max_requests > devattr->max_qp_wr)
                cdata->max_requests = devattr->max_qp_wr;

        ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
        ep->rep_attr.qp_context = ep;
        ep->rep_attr.srq = NULL;
        ep->rep_attr.cap.max_send_wr = cdata->max_requests;
        rc = ia->ri_ops->ro_open(ia, ep, cdata);
        if (rc)
                return rc;
        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
        ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
        ep->rep_attr.cap.max_recv_sge = 1;
        ep->rep_attr.cap.max_inline_data = 0;
        ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        ep->rep_attr.qp_type = IB_QPT_RC;
        ep->rep_attr.port_num = ~0;

        dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
                "iovs: send %d recv %d\n",
                __func__,
                ep->rep_attr.cap.max_send_wr,
                ep->rep_attr.cap.max_recv_wr,
                ep->rep_attr.cap.max_send_sge,
                ep->rep_attr.cap.max_recv_sge);

        /* set trigger for requesting send completion */
        ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
        if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
                ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
        else if (ep->rep_cqinit <= 2)
                ep->rep_cqinit = 0;
        INIT_CQCOUNT(ep);
        init_waitqueue_head(&ep->rep_connect_wait);
        INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);

        cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
        sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
                              rpcrdma_cq_async_error_upcall, ep, &cq_attr);
        if (IS_ERR(sendcq)) {
                rc = PTR_ERR(sendcq);
                dprintk("RPC: %s: failed to create send CQ: %i\n",
                        __func__, rc);
                goto out1;
        }

        rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
        if (rc) {
                dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                goto out2;
        }

        cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
        recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
                              rpcrdma_cq_async_error_upcall, ep, &cq_attr);
        if (IS_ERR(recvcq)) {
                rc = PTR_ERR(recvcq);
                dprintk("RPC: %s: failed to create recv CQ: %i\n",
                        __func__, rc);
                goto out2;
        }

        rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
        if (rc) {
                dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                ib_destroy_cq(recvcq);
                goto out2;
        }

        ep->rep_attr.send_cq = sendcq;
        ep->rep_attr.recv_cq = recvcq;

        /* Initialize cma parameters */

        /* RPC/RDMA does not use private data */
        ep->rep_remote_cma.private_data = NULL;
        ep->rep_remote_cma.private_data_len = 0;

        /* Client offers RDMA Read but does not initiate */
        ep->rep_remote_cma.initiator_depth = 0;
        if (devattr->max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
                ep->rep_remote_cma.responder_resources = 32;
        else
                ep->rep_remote_cma.responder_resources =
                                                devattr->max_qp_rd_atom;

        ep->rep_remote_cma.retry_count = 7;
        ep->rep_remote_cma.flow_control = 0;
        ep->rep_remote_cma.rnr_retry_count = 0;

        return 0;

out2:
        err = ib_destroy_cq(sendcq);
        if (err)
                dprintk("RPC: %s: ib_destroy_cq returned %i\n",
                        __func__, err);
out1:
        if (ia->ri_dma_mr)
                ib_dereg_mr(ia->ri_dma_mr);
        return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
        int rc;

        dprintk("RPC: %s: entering, connected is %d\n",
                __func__, ep->rep_connected);

        cancel_delayed_work_sync(&ep->rep_connect_worker);

        if (ia->ri_id->qp)
                rpcrdma_ep_disconnect(ep, ia);

        rpcrdma_clean_cq(ep->rep_attr.recv_cq);
        rpcrdma_clean_cq(ep->rep_attr.send_cq);

        if (ia->ri_id->qp) {
                rdma_destroy_qp(ia->ri_id);
                ia->ri_id->qp = NULL;
        }

        rc = ib_destroy_cq(ep->rep_attr.recv_cq);
        if (rc)
                dprintk("RPC: %s: ib_destroy_cq returned %i\n",
                        __func__, rc);

        rc = ib_destroy_cq(ep->rep_attr.send_cq);
        if (rc)
                dprintk("RPC: %s: ib_destroy_cq returned %i\n",
                        __func__, rc);

        if (ia->ri_dma_mr) {
                rc = ib_dereg_mr(ia->ri_dma_mr);
                dprintk("RPC: %s: ib_dereg_mr returned %i\n",
                        __func__, rc);
        }
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
        struct rdma_cm_id *id, *old;
        int rc = 0;
        int retry_count = 0;

        if (ep->rep_connected != 0) {
                struct rpcrdma_xprt *xprt;
retry:
                dprintk("RPC: %s: reconnecting...\n", __func__);

                rpcrdma_ep_disconnect(ep, ia);
                rpcrdma_flush_cqs(ep);

                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
                id = rpcrdma_create_id(xprt, ia,
                                (struct sockaddr *)&xprt->rx_data.addr);
                if (IS_ERR(id)) {
                        rc = -EHOSTUNREACH;
                        goto out;
                }
                /* TEMP TEMP TEMP - fail if new device:
                 * Deregister/remarshal *all* requests!
                 * Close and recreate adapter, pd, etc!
                 * Re-determine all attributes still sane!
                 * More stuff I haven't thought of!
                 * Rrrgh!
                 */
                if (ia->ri_device != id->device) {
                        printk("RPC: %s: can't reconnect on "
                                "different device!\n", __func__);
                        rpcrdma_destroy_id(id);
                        rc = -ENETUNREACH;
                        goto out;
                }
                /* END TEMP */
                rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
                if (rc) {
                        dprintk("RPC: %s: rdma_create_qp failed %i\n",
                                __func__, rc);
                        rpcrdma_destroy_id(id);
                        rc = -ENETUNREACH;
                        goto out;
                }

                write_lock(&ia->ri_qplock);
                old = ia->ri_id;
                ia->ri_id = id;
                write_unlock(&ia->ri_qplock);

                rdma_destroy_qp(old);
                rpcrdma_destroy_id(old);
        } else {
                dprintk("RPC: %s: connecting...\n", __func__);
                rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
                if (rc) {
                        dprintk("RPC: %s: rdma_create_qp failed %i\n",
                                __func__, rc);
                        /* do not update ep->rep_connected */
                        return -ENETUNREACH;
                }
        }

        ep->rep_connected = 0;

        rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
        if (rc) {
                dprintk("RPC: %s: rdma_connect() failed with %i\n",
                        __func__, rc);
                goto out;
        }

        wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

        /*
         * Check state. A non-peer reject indicates no listener
         * (ECONNREFUSED), which may be a transient state. All
         * others indicate a transport condition which has already
         * undergone a best-effort.
         */
        if (ep->rep_connected == -ECONNREFUSED &&
            ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
                dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
                goto retry;
        }
        if (ep->rep_connected <= 0) {
                /* Sometimes, the only way to reliably connect to remote
                 * CMs is to use same nonzero values for ORD and IRD. */
                if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
                    (ep->rep_remote_cma.responder_resources == 0 ||
                     ep->rep_remote_cma.initiator_depth !=
                                ep->rep_remote_cma.responder_resources)) {
                        if (ep->rep_remote_cma.responder_resources == 0)
                                ep->rep_remote_cma.responder_resources = 1;
                        ep->rep_remote_cma.initiator_depth =
                                ep->rep_remote_cma.responder_resources;
                        goto retry;
                }
                rc = ep->rep_connected;
        } else {
                dprintk("RPC: %s: connected\n", __func__);
        }

out:
        if (rc)
                ep->rep_connected = rc;
        return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
        int rc;

        rpcrdma_flush_cqs(ep);
        rc = rdma_disconnect(ia->ri_id);
        if (!rc) {
                /* returns without wait if not connected */
                wait_event_interruptible(ep->rep_connect_wait,
                                         ep->rep_connected != 1);
                dprintk("RPC: %s: after wait, %sconnected\n", __func__,
                        (ep->rep_connected == 1) ? "still " : "dis");
        } else {
                dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
                ep->rep_connected = rc;
        }
}
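
/* Allocators for the transport's request and reply descriptors.
 * rpcrdma_create_rep() also allocates and DMA-maps the buffer that
 * receives the inline portion of an RPC/RDMA reply.
 */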
static struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_req *req;

        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (req == NULL)
                return ERR_PTR(-ENOMEM);

        req->rl_buffer = &r_xprt->rx_buf;
        return req;
}

static struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        struct rpcrdma_rep *rep;
        int rc;

        rc = -ENOMEM;
        rep = kzalloc(sizeof(*rep), GFP_KERNEL);
        if (rep == NULL)
                goto out;

        rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
                                               GFP_KERNEL);
        if (IS_ERR(rep->rr_rdmabuf)) {
                rc = PTR_ERR(rep->rr_rdmabuf);
                goto out_free;
        }

        rep->rr_device = ia->ri_device;
        rep->rr_rxprt = r_xprt;
        return rep;

out_free:
        kfree(rep);
out:
        return ERR_PTR(rc);
}
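
/* rpcrdma_buffer_create - allocate the transport's pools of rpcrdma_req
 * and rpcrdma_rep structures, sized by cdata->max_requests, and let the
 * memory registration ops initialize their MW state.
 */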
int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
        char *p;
        size_t len;
        int i, rc;

        buf->rb_max_requests = cdata->max_requests;
        spin_lock_init(&buf->rb_lock);

        /* Need to allocate:
         *   1. arrays for send and recv pointers
         *   2. arrays of struct rpcrdma_req to fill in pointers
         *   3. array of struct rpcrdma_rep for replies
         * Send/recv buffers in req/rep need to be registered
         */
        len = buf->rb_max_requests *
                (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));

        p = kzalloc(len, GFP_KERNEL);
        if (p == NULL) {
                dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
                        __func__, len);
                rc = -ENOMEM;
                goto out;
        }
        buf->rb_pool = p;	/* for freeing it later */

        buf->rb_send_bufs = (struct rpcrdma_req **) p;
        p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
        buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
        p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

        rc = ia->ri_ops->ro_init(r_xprt);
        if (rc)
                goto out;

        for (i = 0; i < buf->rb_max_requests; i++) {
                struct rpcrdma_req *req;
                struct rpcrdma_rep *rep;

                req = rpcrdma_create_req(r_xprt);
                if (IS_ERR(req)) {
                        dprintk("RPC: %s: request buffer %d alloc"
                                " failed\n", __func__, i);
                        rc = PTR_ERR(req);
                        goto out;
                }
                buf->rb_send_bufs[i] = req;

                rep = rpcrdma_create_rep(r_xprt);
                if (IS_ERR(rep)) {
                        dprintk("RPC: %s: reply buffer %d alloc failed\n",
                                __func__, i);
                        rc = PTR_ERR(rep);
                        goto out;
                }
                buf->rb_recv_bufs[i] = rep;
        }

        return 0;
out:
        rpcrdma_buffer_destroy(buf);
        return rc;
}

static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
        if (!rep)
                return;

        rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
        kfree(rep);
}

static void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
        if (!req)
                return;

        rpcrdma_free_regbuf(ia, req->rl_sendbuf);
        rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
        kfree(req);
}

void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
        struct rpcrdma_ia *ia = rdmab_to_ia(buf);
        int i;

        /* clean up in reverse order from create
         *   1. recv mr memory (mr free, then kfree)
         *   2. send mr memory (mr free, then kfree)
         *   3. MWs
         */
        dprintk("RPC: %s: entering\n", __func__);

        for (i = 0; i < buf->rb_max_requests; i++) {
                if (buf->rb_recv_bufs)
                        rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
                if (buf->rb_send_bufs)
                        rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
        }

        ia->ri_ops->ro_destroy(buf);

        kfree(buf->rb_pool);
}
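
/* rpcrdma_get_mw - take an MW off the transport's free list, or return
 * NULL (with a warning) if the list is empty. rpcrdma_put_mw returns
 * one to the list. Both take rb_mwlock.
 */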
struct rpcrdma_mw *
rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
        struct rpcrdma_mw *mw = NULL;

        spin_lock(&buf->rb_mwlock);
        if (!list_empty(&buf->rb_mws)) {
                mw = list_first_entry(&buf->rb_mws,
                                      struct rpcrdma_mw, mw_list);
                list_del_init(&mw->mw_list);
        }
        spin_unlock(&buf->rb_mwlock);

        if (!mw)
                pr_err("RPC: %s: no MWs available\n", __func__);
        return mw;
}

void
rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;

        spin_lock(&buf->rb_mwlock);
        list_add_tail(&mw->mw_list, &buf->rb_mws);
        spin_unlock(&buf->rb_mwlock);
}
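
/* Return a request (and its attached reply, if any) to the free arrays.
 * Caller must hold buf->rb_lock.
 */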
static void
rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
        buf->rb_send_bufs[--buf->rb_send_index] = req;
        req->rl_niovs = 0;
        if (req->rl_reply) {
                buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
                req->rl_reply = NULL;
        }
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
        struct rpcrdma_req *req;
        unsigned long flags;

        spin_lock_irqsave(&buffers->rb_lock, flags);

        if (buffers->rb_send_index == buffers->rb_max_requests) {
                spin_unlock_irqrestore(&buffers->rb_lock, flags);
                dprintk("RPC: %s: out of request buffers\n", __func__);
                return ((struct rpcrdma_req *)NULL);
        }

        req = buffers->rb_send_bufs[buffers->rb_send_index];
        if (buffers->rb_send_index < buffers->rb_recv_index) {
                dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
                        __func__,
                        buffers->rb_recv_index - buffers->rb_send_index);
                req->rl_reply = NULL;
        } else {
                req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
                buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
        }
        buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;

        spin_unlock_irqrestore(&buffers->rb_lock, flags);
        return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
        struct rpcrdma_buffer *buffers = req->rl_buffer;
        unsigned long flags;

        spin_lock_irqsave(&buffers->rb_lock, flags);
        rpcrdma_buffer_put_sendbuf(req, buffers);
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
        struct rpcrdma_buffer *buffers = req->rl_buffer;
        unsigned long flags;

        spin_lock_irqsave(&buffers->rb_lock, flags);
        if (buffers->rb_recv_index < buffers->rb_max_requests) {
                req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
                buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
        }
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
        struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
        unsigned long flags;

        spin_lock_irqsave(&buffers->rb_lock, flags);
        buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

void
rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
{
        dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
                seg->mr_offset,
                (unsigned long long)seg->mr_dma, seg->mr_dmalen);
}

/**
 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
 * @ia: controlling rpcrdma_ia
 * @size: size of buffer to be allocated, in bytes
 * @flags: GFP flags
 *
 * Returns pointer to private header of an area of internally
 * registered memory, or an ERR_PTR. The registered buffer follows
 * the end of the private header.
 *
 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
 * receiving the payload of RDMA RECV operations. regbufs are not
 * used for RDMA READ/WRITE operations, thus are registered only for
 * LOCAL access.
 */
struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
        struct rpcrdma_regbuf *rb;
        struct ib_sge *iov;

        rb = kmalloc(sizeof(*rb) + size, flags);
        if (rb == NULL)
                goto out;

        iov = &rb->rg_iov;
        iov->addr = ib_dma_map_single(ia->ri_device,
                                      (void *)rb->rg_base, size,
                                      DMA_BIDIRECTIONAL);
        if (ib_dma_mapping_error(ia->ri_device, iov->addr))
                goto out_free;

        iov->length = size;
        iov->lkey = ia->ri_pd->local_dma_lkey;
        rb->rg_size = size;
        rb->rg_owner = NULL;
        return rb;

out_free:
        kfree(rb);
out:
        return ERR_PTR(-ENOMEM);
}

/**
 * rpcrdma_free_regbuf - deregister and free registered buffer
 * @ia: controlling rpcrdma_ia
 * @rb: regbuf to be deregistered and freed
 */
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
        struct ib_sge *iov;

        if (!rb)
                return;

        iov = &rb->rg_iov;
        ib_dma_unmap_single(ia->ri_device,
                            iov->addr, iov->length, DMA_BIDIRECTIONAL);
        kfree(rb);
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
                struct rpcrdma_ep *ep,
                struct rpcrdma_req *req)
{
        struct ib_device *device = ia->ri_device;
        struct ib_send_wr send_wr, *send_wr_fail;
        struct rpcrdma_rep *rep = req->rl_reply;
        struct ib_sge *iov = req->rl_send_iov;
        int i, rc;

        if (rep) {
                rc = rpcrdma_ep_post_recv(ia, ep, rep);
                if (rc)
                        goto out;
                req->rl_reply = NULL;
        }

        send_wr.next = NULL;
        send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
        send_wr.sg_list = iov;
        send_wr.num_sge = req->rl_niovs;
        send_wr.opcode = IB_WR_SEND;

        for (i = 0; i < send_wr.num_sge; i++)
                ib_dma_sync_single_for_device(device, iov[i].addr,
                                              iov[i].length, DMA_TO_DEVICE);
        dprintk("RPC: %s: posting %d s/g entries\n",
                __func__, send_wr.num_sge);

        if (DECR_CQCOUNT(ep) > 0)
                send_wr.send_flags = 0;
        else { /* Provider must take a send completion every now and then */
                INIT_CQCOUNT(ep);
                send_wr.send_flags = IB_SEND_SIGNALED;
        }

        rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
        if (rc)
                dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
                        rc);
out:
        return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
                     struct rpcrdma_ep *ep,
                     struct rpcrdma_rep *rep)
{
        struct ib_recv_wr recv_wr, *recv_wr_fail;
        int rc;

        recv_wr.next = NULL;
        recv_wr.wr_id = (u64) (unsigned long) rep;
        recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
        recv_wr.num_sge = 1;

        ib_dma_sync_single_for_cpu(ia->ri_device,
                                   rdmab_addr(rep->rr_rdmabuf),
                                   rdmab_length(rep->rr_rdmabuf),
                                   DMA_BIDIRECTIONAL);

        rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
        if (rc)
                dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
                        rc);
        return rc;
}

/* How many chunk list items fit within our inline buffers?
 */
unsigned int
rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
        int bytes, segments;

        bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
        bytes -= RPCRDMA_HDRLEN_MIN;
        if (bytes < sizeof(struct rpcrdma_segment) * 2) {
                pr_warn("RPC: %s: inline threshold too small\n",
                        __func__);
                return 0;
        }
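
        /* 1 << (fls(n) - 1) rounds the segment count down to the
         * largest power of two that does not exceed n.
         */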
        segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
        dprintk("RPC: %s: max chunk list size = %d segments\n",
                __func__, segments);
        return segments;
}