/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow up to a page of inline data to go with the SQE
 */
#define NVMET_RDMA_INLINE_DATA_SIZE	PAGE_SIZE
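
/*
 * Each RDMA RECV is posted with up to two SGEs (see nvmet_rdma_alloc_cmd()):
 * sge[0] always covers the 64-byte NVMe command capsule, and sge[1], used
 * only for I/O queue commands, covers the inline data page, so inline write
 * data arrives in the same RECV as the command itself.
 */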

struct nvmet_rdma_cmd {
        struct ib_sge sge[2];
        struct ib_cqe cqe;
        struct ib_recv_wr wr;
        struct scatterlist inline_sg;
        struct page *inline_page;
        struct nvme_command *nvme_cmd;
        struct nvmet_rdma_queue *queue;
};

enum {
        NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
        NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
        struct ib_sge send_sge;
        struct ib_cqe send_cqe;
        struct ib_send_wr send_wr;

        struct nvmet_rdma_cmd *cmd;
        struct nvmet_rdma_queue *queue;

        struct ib_cqe read_cqe;
        struct rdma_rw_ctx rw;

        struct nvmet_req req;

        u8 n_rdma;
        u32 flags;
        u32 invalidate_rkey;

        struct list_head wait_list;
        struct list_head free_list;
};

enum nvmet_rdma_queue_state {
        NVMET_RDMA_Q_CONNECTING,
        NVMET_RDMA_Q_LIVE,
        NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
        struct rdma_cm_id *cm_id;
        struct nvmet_port *port;
        struct ib_cq *cq;
        atomic_t sq_wr_avail;
        struct nvmet_rdma_device *dev;
        spinlock_t state_lock;
        enum nvmet_rdma_queue_state state;
        struct nvmet_cq nvme_cq;
        struct nvmet_sq nvme_sq;

        struct nvmet_rdma_rsp *rsps;
        struct list_head free_rsps;
        spinlock_t rsps_lock;
        struct nvmet_rdma_cmd *cmds;

        struct work_struct release_work;
        struct list_head rsp_wait_list;
        struct list_head rsp_wr_wait_list;
        spinlock_t rsp_wr_wait_lock;

        int idx;
        int host_qid;
        int recv_queue_size;
        int send_queue_size;

        struct list_head queue_list;
};

struct nvmet_rdma_device {
        struct ib_device *device;
        struct ib_pd *pd;
        struct ib_srq *srq;
        struct nvmet_rdma_cmd *srq_cmds;
        size_t srq_size;
        struct kref ref;
        struct list_head entry;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
        return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}
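
/*
 * The NVMe keyed SGL descriptor stores its length in a 3-byte little-endian
 * field, hence the 24-bit helper above; nvmet_rdma_map_sgl_keyed() uses it
 * together with get_unaligned_le32() for the 4-byte key.
 */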

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
        return nvme_is_write(rsp->req.cmd) &&
                rsp->req.transfer_len &&
                !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
        return !nvme_is_write(rsp->req.cmd) &&
                rsp->req.transfer_len &&
                !rsp->req.rsp->status &&
                !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
        struct nvmet_rdma_rsp *rsp;
        unsigned long flags;

        spin_lock_irqsave(&queue->rsps_lock, flags);
        rsp = list_first_entry(&queue->free_rsps,
                        struct nvmet_rdma_rsp, free_list);
        list_del(&rsp->free_list);
        spin_unlock_irqrestore(&queue->rsps_lock, flags);

        return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
        unsigned long flags;

        spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
        list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
        spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
                        struct nvmet_rdma_cmd *c, bool admin)
{
        /* NVMe command / RDMA RECV */
        c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
        if (!c->nvme_cmd)
                goto out;

        c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
                        sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
        if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
                goto out_free_cmd;

        c->sge[0].length = sizeof(*c->nvme_cmd);
        c->sge[0].lkey = ndev->pd->local_dma_lkey;

        if (!admin) {
                c->inline_page = alloc_pages(GFP_KERNEL,
                                get_order(NVMET_RDMA_INLINE_DATA_SIZE));
                if (!c->inline_page)
                        goto out_unmap_cmd;
                c->sge[1].addr = ib_dma_map_page(ndev->device,
                                c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE,
                                DMA_FROM_DEVICE);
                if (ib_dma_mapping_error(ndev->device, c->sge[1].addr))
                        goto out_free_inline_page;
                c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE;
                c->sge[1].lkey = ndev->pd->local_dma_lkey;
        }

        c->cqe.done = nvmet_rdma_recv_done;

        c->wr.wr_cqe = &c->cqe;
        c->wr.sg_list = c->sge;
        c->wr.num_sge = admin ? 1 : 2;

        return 0;

out_free_inline_page:
        if (!admin) {
                __free_pages(c->inline_page,
                                get_order(NVMET_RDMA_INLINE_DATA_SIZE));
        }
out_unmap_cmd:
        ib_dma_unmap_single(ndev->device, c->sge[0].addr,
                        sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
        kfree(c->nvme_cmd);

out:
        return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *c, bool admin)
{
        if (!admin) {
                ib_dma_unmap_page(ndev->device, c->sge[1].addr,
                                NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE);
                __free_pages(c->inline_page,
                                get_order(NVMET_RDMA_INLINE_DATA_SIZE));
        }
        ib_dma_unmap_single(ndev->device, c->sge[0].addr,
                        sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
        kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
                int nr_cmds, bool admin)
{
        struct nvmet_rdma_cmd *cmds;
        int ret = -EINVAL, i;

        cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
        if (!cmds)
                goto out;

        for (i = 0; i < nr_cmds; i++) {
                ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
                if (ret)
                        goto out_free;
        }

        return cmds;

out_free:
        while (--i >= 0)
                nvmet_rdma_free_cmd(ndev, cmds + i, admin);
        kfree(cmds);
out:
        return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
        int i;

        for (i = 0; i < nr_cmds; i++)
                nvmet_rdma_free_cmd(ndev, cmds + i, admin);
        kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_rsp *r)
{
        /* NVMe CQE / RDMA SEND */
        r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
        if (!r->req.rsp)
                goto out;

        r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
                        sizeof(*r->req.rsp), DMA_TO_DEVICE);
        if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
                goto out_free_rsp;

        r->send_sge.length = sizeof(*r->req.rsp);
        r->send_sge.lkey = ndev->pd->local_dma_lkey;

        r->send_cqe.done = nvmet_rdma_send_done;

        r->send_wr.wr_cqe = &r->send_cqe;
        r->send_wr.sg_list = &r->send_sge;
        r->send_wr.num_sge = 1;
        r->send_wr.send_flags = IB_SEND_SIGNALED;

        /* Data In / RDMA READ */
        r->read_cqe.done = nvmet_rdma_read_data_done;
        return 0;

out_free_rsp:
        kfree(r->req.rsp);
out:
        return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_rsp *r)
{
        ib_dma_unmap_single(ndev->device, r->send_sge.addr,
                        sizeof(*r->req.rsp), DMA_TO_DEVICE);
        kfree(r->req.rsp);
}

static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
        struct nvmet_rdma_device *ndev = queue->dev;
        int nr_rsps = queue->recv_queue_size * 2;
        int ret = -EINVAL, i;

        queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
                        GFP_KERNEL);
        if (!queue->rsps)
                goto out;

        for (i = 0; i < nr_rsps; i++) {
                struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

                ret = nvmet_rdma_alloc_rsp(ndev, rsp);
                if (ret)
                        goto out_free;

                list_add_tail(&rsp->free_list, &queue->free_rsps);
        }

        return 0;

out_free:
        while (--i >= 0) {
                struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

                list_del(&rsp->free_list);
                nvmet_rdma_free_rsp(ndev, rsp);
        }
        kfree(queue->rsps);
out:
        return ret;
}
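
/*
 * Twice the RECV queue depth worth of responses are allocated above,
 * presumably because a response is only returned to free_rsps once its
 * SEND completes, which can lag behind the arrival of the next command
 * on the same RECV slot.
 */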

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
        struct nvmet_rdma_device *ndev = queue->dev;
        int i, nr_rsps = queue->recv_queue_size * 2;

        for (i = 0; i < nr_rsps; i++) {
                struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

                list_del(&rsp->free_list);
                nvmet_rdma_free_rsp(ndev, rsp);
        }
        kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *cmd)
{
        struct ib_recv_wr *bad_wr;

        ib_dma_sync_single_for_device(ndev->device,
                cmd->sge[0].addr, cmd->sge[0].length,
                DMA_FROM_DEVICE);

        if (ndev->srq)
                return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
        return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
}

static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
        spin_lock(&queue->rsp_wr_wait_lock);
        while (!list_empty(&queue->rsp_wr_wait_list)) {
                struct nvmet_rdma_rsp *rsp;
                bool ret;

                rsp = list_entry(queue->rsp_wr_wait_list.next,
                                struct nvmet_rdma_rsp, wait_list);
                list_del(&rsp->wait_list);

                spin_unlock(&queue->rsp_wr_wait_lock);
                ret = nvmet_rdma_execute_command(rsp);
                spin_lock(&queue->rsp_wr_wait_lock);

                if (!ret) {
                        list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
                        break;
                }
        }
        spin_unlock(&queue->rsp_wr_wait_lock);
}

static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
        struct nvmet_rdma_queue *queue = rsp->queue;

        atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

        if (rsp->n_rdma) {
                rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
                                queue->cm_id->port_num, rsp->req.sg,
                                rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        }

        if (rsp->req.sg != &rsp->cmd->inline_sg)
                sgl_free(rsp->req.sg);

        if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
                nvmet_rdma_process_wr_wait_list(queue);

        nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
        if (queue->nvme_sq.ctrl) {
                nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
        } else {
                /*
                 * We didn't set up the controller yet in case of an admin
                 * connect error, so just disconnect and clean up the queue.
                 */
                nvmet_rdma_queue_disconnect(queue);
        }
}

static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvmet_rdma_rsp *rsp =
                container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);

        nvmet_rdma_release_rsp(rsp);

        if (unlikely(wc->status != IB_WC_SUCCESS &&
                     wc->status != IB_WC_WR_FLUSH_ERR)) {
                pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
                        wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
                nvmet_rdma_error_comp(rsp->queue);
        }
}

static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
        struct nvmet_rdma_rsp *rsp =
                container_of(req, struct nvmet_rdma_rsp, req);
        struct rdma_cm_id *cm_id = rsp->queue->cm_id;
        struct ib_send_wr *first_wr, *bad_wr;

        if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
                rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
                rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
        } else {
                rsp->send_wr.opcode = IB_WR_SEND;
        }

        if (nvmet_rdma_need_data_out(rsp))
                first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
                                cm_id->port_num, NULL, &rsp->send_wr);
        else
                first_wr = &rsp->send_wr;

        nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

        ib_dma_sync_single_for_device(rsp->queue->dev->device,
                rsp->send_sge.addr, rsp->send_sge.length,
                DMA_TO_DEVICE);

        if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) {
                pr_err("sending cmd response failed\n");
                nvmet_rdma_release_rsp(rsp);
        }
}
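
/*
 * nvmet_rdma_queue_response() reposts the command's RECV buffer before
 * sending the response, and for Data-Out (host read) commands it lets
 * rdma_rw_ctx_wrs() chain the RDMA WRITE work requests in front of the
 * SEND so the whole chain is posted with a single ib_post_send() call.
 */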

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvmet_rdma_rsp *rsp =
                container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
        struct nvmet_rdma_queue *queue = cq->cq_context;

        WARN_ON(rsp->n_rdma <= 0);
        atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
        rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
                        queue->cm_id->port_num, rsp->req.sg,
                        rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        rsp->n_rdma = 0;

        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                nvmet_req_uninit(&rsp->req);
                nvmet_rdma_release_rsp(rsp);
                if (wc->status != IB_WC_WR_FLUSH_ERR) {
                        pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
                                wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
                        nvmet_rdma_error_comp(queue);
                }
                return;
        }

        nvmet_req_execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
                u64 off)
{
        sg_init_table(&rsp->cmd->inline_sg, 1);
        sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off);
        rsp->req.sg = &rsp->cmd->inline_sg;
        rsp->req.sg_cnt = 1;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
        struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
        u64 off = le64_to_cpu(sgl->addr);
        u32 len = le32_to_cpu(sgl->length);

        if (!nvme_is_write(rsp->req.cmd))
                return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

        if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) {
                pr_err("invalid inline data offset!\n");
                return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
        }

        /* no data command? */
        if (!len)
                return 0;

        nvmet_rdma_use_inline_sg(rsp, len, off);
        rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
        rsp->req.transfer_len += len;
        return 0;
}

static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
                struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
        struct rdma_cm_id *cm_id = rsp->queue->cm_id;
        u64 addr = le64_to_cpu(sgl->addr);
        u32 len = get_unaligned_le24(sgl->length);
        u32 key = get_unaligned_le32(sgl->key);
        int ret;

        /* no data command? */
        if (!len)
                return 0;

        rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
        if (!rsp->req.sg)
                return NVME_SC_INTERNAL;

        ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
                        rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
                        nvmet_data_dir(&rsp->req));
        if (ret < 0)
                return NVME_SC_INTERNAL;
        rsp->req.transfer_len += len;
        rsp->n_rdma += ret;

        if (invalidate) {
                rsp->invalidate_rkey = key;
                rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
        }

        return 0;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
        struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

        switch (sgl->type >> 4) {
        case NVME_SGL_FMT_DATA_DESC:
                switch (sgl->type & 0xf) {
                case NVME_SGL_FMT_OFFSET:
                        return nvmet_rdma_map_sgl_inline(rsp);
                default:
                        pr_err("invalid SGL subtype: %#x\n", sgl->type);
                        return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
                }
        case NVME_KEY_SGL_FMT_DATA_DESC:
                switch (sgl->type & 0xf) {
                case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
                        return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
                case NVME_SGL_FMT_ADDRESS:
                        return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
                default:
                        pr_err("invalid SGL subtype: %#x\n", sgl->type);
                        return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
                }
        default:
                pr_err("invalid SGL type: %#x\n", sgl->type);
                return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
        }
}
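
/*
 * Send-queue flow control: nvmet_rdma_execute_command() below reserves one
 * work request for the response SEND plus rsp->n_rdma for the RDMA
 * READ/WRITE contexts. If sq_wr_avail would go negative, the command is
 * parked on rsp_wr_wait_list and retried from nvmet_rdma_release_rsp()
 * once earlier responses complete and return their credits.
 */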

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
        struct nvmet_rdma_queue *queue = rsp->queue;

        if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
                        &queue->sq_wr_avail) < 0)) {
                pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
                                1 + rsp->n_rdma, queue->idx,
                                queue->nvme_sq.ctrl->cntlid);
                atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
                return false;
        }

        if (nvmet_rdma_need_data_in(rsp)) {
                if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
                                queue->cm_id->port_num, &rsp->read_cqe, NULL))
                        nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
        } else {
                nvmet_req_execute(&rsp->req);
        }

        return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
                struct nvmet_rdma_rsp *cmd)
{
        u16 status;

        ib_dma_sync_single_for_cpu(queue->dev->device,
                cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
                DMA_FROM_DEVICE);
        ib_dma_sync_single_for_cpu(queue->dev->device,
                cmd->send_sge.addr, cmd->send_sge.length,
                DMA_TO_DEVICE);

        if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
                        &queue->nvme_sq, &nvmet_rdma_ops))
                return;

        status = nvmet_rdma_map_sgl(cmd);
        if (status)
                goto out_err;

        if (unlikely(!nvmet_rdma_execute_command(cmd))) {
                spin_lock(&queue->rsp_wr_wait_lock);
                list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
                spin_unlock(&queue->rsp_wr_wait_lock);
        }

        return;

out_err:
        nvmet_req_complete(&cmd->req, status);
}

static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvmet_rdma_cmd *cmd =
                container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
        struct nvmet_rdma_queue *queue = cq->cq_context;
        struct nvmet_rdma_rsp *rsp;

        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                if (wc->status != IB_WC_WR_FLUSH_ERR) {
                        pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
                                wc->wr_cqe, ib_wc_status_msg(wc->status),
                                wc->status);
                        nvmet_rdma_error_comp(queue);
                }
                return;
        }

        if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
                pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
                nvmet_rdma_error_comp(queue);
                return;
        }

        cmd->queue = queue;
        rsp = nvmet_rdma_get_rsp(queue);
        rsp->queue = queue;
        rsp->cmd = cmd;
        rsp->flags = 0;
        rsp->req.cmd = cmd->nvme_cmd;
        rsp->req.port = queue->port;
        rsp->n_rdma = 0;

        if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
                unsigned long flags;

                spin_lock_irqsave(&queue->state_lock, flags);
                if (queue->state == NVMET_RDMA_Q_CONNECTING)
                        list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
                else
                        nvmet_rdma_put_rsp(rsp);
                spin_unlock_irqrestore(&queue->state_lock, flags);
                return;
        }

        nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
        if (!ndev->srq)
                return;

        nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
        ib_destroy_srq(ndev->srq);
}

static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
        struct ib_srq_init_attr srq_attr = { NULL, };
        struct ib_srq *srq;
        size_t srq_size;
        int ret, i;

        srq_size = 4095;	/* XXX: tune */
        srq_attr.attr.max_wr = srq_size;
        srq_attr.attr.max_sge = 2;
        srq_attr.attr.srq_limit = 0;
        srq_attr.srq_type = IB_SRQT_BASIC;
        srq = ib_create_srq(ndev->pd, &srq_attr);
        if (IS_ERR(srq)) {
                /*
                 * If SRQs aren't supported we just go ahead and use normal
                 * non-shared receive queues.
                 */
                pr_info("SRQ requested but not supported.\n");
                return 0;
        }

        ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
        if (IS_ERR(ndev->srq_cmds)) {
                ret = PTR_ERR(ndev->srq_cmds);
                goto out_destroy_srq;
        }

        ndev->srq = srq;
        ndev->srq_size = srq_size;

        for (i = 0; i < srq_size; i++)
                nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);

        return 0;

out_destroy_srq:
        ib_destroy_srq(srq);
        return ret;
}
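
/*
 * With use_srq enabled, a single shared receive queue per device serves the
 * RECVs of every queue on that device (ndev->srq_cmds has no per-queue
 * owner). If SRQ creation fails, nvmet_rdma_init_srq() returns 0 and the
 * target falls back to per-queue receive queues, as the pr_info() above
 * notes.
 */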

static void nvmet_rdma_free_dev(struct kref *ref)
{
        struct nvmet_rdma_device *ndev =
                container_of(ref, struct nvmet_rdma_device, ref);

        mutex_lock(&device_list_mutex);
        list_del(&ndev->entry);
        mutex_unlock(&device_list_mutex);

        nvmet_rdma_destroy_srq(ndev);
        ib_dealloc_pd(ndev->pd);

        kfree(ndev);
}

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
        struct nvmet_rdma_device *ndev;
        int ret;

        mutex_lock(&device_list_mutex);
        list_for_each_entry(ndev, &device_list, entry) {
                if (ndev->device->node_guid == cm_id->device->node_guid &&
                    kref_get_unless_zero(&ndev->ref))
                        goto out_unlock;
        }

        ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
        if (!ndev)
                goto out_err;

        ndev->device = cm_id->device;
        kref_init(&ndev->ref);

        ndev->pd = ib_alloc_pd(ndev->device, 0);
        if (IS_ERR(ndev->pd))
                goto out_free_dev;

        if (nvmet_rdma_use_srq) {
                ret = nvmet_rdma_init_srq(ndev);
                if (ret)
                        goto out_free_pd;
        }

        list_add(&ndev->entry, &device_list);
out_unlock:
        mutex_unlock(&device_list_mutex);
        pr_debug("added %s.\n", ndev->device->name);
        return ndev;

out_free_pd:
        ib_dealloc_pd(ndev->pd);
out_free_dev:
        kfree(ndev);
out_err:
        mutex_unlock(&device_list_mutex);
        return NULL;
}

static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
        struct ib_qp_init_attr qp_attr;
        struct nvmet_rdma_device *ndev = queue->dev;
        int comp_vector, nr_cqe, ret, i;

        /*
         * Spread the io queues across completion vectors,
         * but still keep all admin queues on vector 0.
         */
        comp_vector = !queue->host_qid ? 0 :
                queue->idx % ndev->device->num_comp_vectors;

        /*
         * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
         */
        nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

        queue->cq = ib_alloc_cq(ndev->device, queue,
                        nr_cqe + 1, comp_vector,
                        IB_POLL_WORKQUEUE);
        if (IS_ERR(queue->cq)) {
                ret = PTR_ERR(queue->cq);
                pr_err("failed to create CQ cqe= %d ret= %d\n",
                       nr_cqe + 1, ret);
                goto out;
        }

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_context = queue;
        qp_attr.event_handler = nvmet_rdma_qp_event;
        qp_attr.send_cq = queue->cq;
        qp_attr.recv_cq = queue->cq;
        qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        qp_attr.qp_type = IB_QPT_RC;
        /* +1 for drain */
        qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
        qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
        qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
                                        ndev->device->attrs.max_sge);

        if (ndev->srq) {
                qp_attr.srq = ndev->srq;
        } else {
                /* +1 for drain */
                qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
                qp_attr.cap.max_recv_sge = 2;
        }

        ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
        if (ret) {
                pr_err("failed to create_qp ret= %d\n", ret);
                goto err_destroy_cq;
        }

        atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

        pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
                 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
                 qp_attr.cap.max_send_wr, queue->cm_id);

        if (!ndev->srq) {
                for (i = 0; i < queue->recv_queue_size; i++) {
                        queue->cmds[i].queue = queue;
                        nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
                }
        }

out:
        return ret;

err_destroy_cq:
        ib_free_cq(queue->cq);
        goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
        struct ib_qp *qp = queue->cm_id->qp;

        ib_drain_qp(qp);
        rdma_destroy_id(queue->cm_id);
        ib_destroy_qp(qp);
        ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
        pr_debug("freeing queue %d\n", queue->idx);

        nvmet_sq_destroy(&queue->nvme_sq);

        nvmet_rdma_destroy_queue_ib(queue);
        if (!queue->dev->srq) {
                nvmet_rdma_free_cmds(queue->dev, queue->cmds,
                                queue->recv_queue_size,
                                !queue->host_qid);
        }
        nvmet_rdma_free_rsps(queue);
        ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
        kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
        struct nvmet_rdma_queue *queue =
                container_of(w, struct nvmet_rdma_queue, release_work);
        struct nvmet_rdma_device *dev = queue->dev;

        nvmet_rdma_free_queue(queue);

        kref_put(&dev->ref, nvmet_rdma_free_dev);
}

static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
                                struct nvmet_rdma_queue *queue)
{
        struct nvme_rdma_cm_req *req;

        req = (struct nvme_rdma_cm_req *)conn->private_data;
        if (!req || conn->private_data_len == 0)
                return NVME_RDMA_CM_INVALID_LEN;

        if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
                return NVME_RDMA_CM_INVALID_RECFMT;

        queue->host_qid = le16_to_cpu(req->qid);

        /*
         * req->hsqsize corresponds to our recv queue size plus 1
         * req->hrqsize corresponds to our send queue size
         */
        queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
        queue->send_queue_size = le16_to_cpu(req->hrqsize);

        if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
                return NVME_RDMA_CM_INVALID_HSQSIZE;

        /* XXX: Should we enforce some kind of max for IO queues? */

        return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
                                enum nvme_rdma_cm_status status)
{
        struct nvme_rdma_cm_rej rej;

        pr_debug("rejecting connect request: status %d (%s)\n",
                 status, nvme_rdma_cm_msg(status));

        rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
        rej.sts = cpu_to_le16(status);

        return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
                struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
{
        struct nvmet_rdma_queue *queue;
        int ret;

        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
        if (!queue) {
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_reject;
        }

        ret = nvmet_sq_init(&queue->nvme_sq);
        if (ret) {
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_free_queue;
        }

        ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
        if (ret)
                goto out_destroy_sq;

        /*
         * Schedules the actual release because calling rdma_destroy_id from
         * inside a CM callback would trigger a deadlock. (great API design..)
         */
        INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
        queue->dev = ndev;
        queue->cm_id = cm_id;

        spin_lock_init(&queue->state_lock);
        queue->state = NVMET_RDMA_Q_CONNECTING;
        INIT_LIST_HEAD(&queue->rsp_wait_list);
        INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
        spin_lock_init(&queue->rsp_wr_wait_lock);
        INIT_LIST_HEAD(&queue->free_rsps);
        spin_lock_init(&queue->rsps_lock);
        INIT_LIST_HEAD(&queue->queue_list);

        queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
        if (queue->idx < 0) {
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_destroy_sq;
        }

        ret = nvmet_rdma_alloc_rsps(queue);
        if (ret) {
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_ida_remove;
        }

        if (!ndev->srq) {
                queue->cmds = nvmet_rdma_alloc_cmds(ndev,
                                queue->recv_queue_size,
                                !queue->host_qid);
                if (IS_ERR(queue->cmds)) {
                        ret = NVME_RDMA_CM_NO_RSC;
                        goto out_free_responses;
                }
        }

        ret = nvmet_rdma_create_queue_ib(queue);
        if (ret) {
                pr_err("%s: creating RDMA queue failed (%d).\n",
                        __func__, ret);
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_free_cmds;
        }

        return queue;

out_free_cmds:
        if (!ndev->srq) {
                nvmet_rdma_free_cmds(queue->dev, queue->cmds,
                                queue->recv_queue_size,
                                !queue->host_qid);
        }
out_free_responses:
        nvmet_rdma_free_rsps(queue);
out_ida_remove:
        ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
        nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
        kfree(queue);
out_reject:
        nvmet_rdma_cm_reject(cm_id, ret);
        return NULL;
}

static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
        struct nvmet_rdma_queue *queue = priv;

        switch (event->event) {
        case IB_EVENT_COMM_EST:
                rdma_notify(queue->cm_id, event->event);
                break;
        default:
                pr_err("received IB QP event: %s (%d)\n",
                       ib_event_msg(event->event), event->event);
                break;
        }
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
                struct nvmet_rdma_queue *queue,
                struct rdma_conn_param *p)
{
        struct rdma_conn_param param = { };
        struct nvme_rdma_cm_rep priv = { };
        int ret = -ENOMEM;

        param.rnr_retry_count = 7;
        param.flow_control = 1;
        param.initiator_depth = min_t(u8, p->initiator_depth,
                queue->dev->device->attrs.max_qp_init_rd_atom);
        param.private_data = &priv;
        param.private_data_len = sizeof(priv);
        priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
        priv.crqsize = cpu_to_le16(queue->recv_queue_size);

        ret = rdma_accept(cm_id, &param);
        if (ret)
                pr_err("rdma_accept failed (error code = %d)\n", ret);

        return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
{
        struct nvmet_rdma_device *ndev;
        struct nvmet_rdma_queue *queue;
        int ret = -EINVAL;

        ndev = nvmet_rdma_find_get_device(cm_id);
        if (!ndev) {
                nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
                return -ECONNREFUSED;
        }

        queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
        if (!queue) {
                ret = -ENOMEM;
                goto put_device;
        }
        queue->port = cm_id->context;

        if (queue->host_qid == 0) {
                /* Let inflight controller teardown complete */
                flush_scheduled_work();
        }

        ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
        if (ret) {
                schedule_work(&queue->release_work);
                /* Destroying rdma_cm id is not needed here */
                return 0;
        }

        mutex_lock(&nvmet_rdma_queue_mutex);
        list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
        mutex_unlock(&nvmet_rdma_queue_mutex);

        return 0;

put_device:
        kref_put(&ndev->ref, nvmet_rdma_free_dev);

        return ret;
}
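
/*
 * Commands that arrive while the queue is still CONNECTING are parked on
 * rsp_wait_list in nvmet_rdma_recv_done(); once the ESTABLISHED CM event
 * arrives, nvmet_rdma_queue_established() below replays them in order.
 */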

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
        unsigned long flags;

        spin_lock_irqsave(&queue->state_lock, flags);
        if (queue->state != NVMET_RDMA_Q_CONNECTING) {
                pr_warn("trying to establish a connected queue\n");
                goto out_unlock;
        }
        queue->state = NVMET_RDMA_Q_LIVE;

        while (!list_empty(&queue->rsp_wait_list)) {
                struct nvmet_rdma_rsp *cmd;

                cmd = list_first_entry(&queue->rsp_wait_list,
                                        struct nvmet_rdma_rsp, wait_list);
                list_del(&cmd->wait_list);

                spin_unlock_irqrestore(&queue->state_lock, flags);
                nvmet_rdma_handle_command(queue, cmd);
                spin_lock_irqsave(&queue->state_lock, flags);
        }

out_unlock:
        spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
        bool disconnect = false;
        unsigned long flags;

        pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

        spin_lock_irqsave(&queue->state_lock, flags);
        switch (queue->state) {
        case NVMET_RDMA_Q_CONNECTING:
        case NVMET_RDMA_Q_LIVE:
                queue->state = NVMET_RDMA_Q_DISCONNECTING;
                disconnect = true;
                break;
        case NVMET_RDMA_Q_DISCONNECTING:
                break;
        }
        spin_unlock_irqrestore(&queue->state_lock, flags);

        if (disconnect) {
                rdma_disconnect(queue->cm_id);
                schedule_work(&queue->release_work);
        }
}

static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
        bool disconnect = false;

        mutex_lock(&nvmet_rdma_queue_mutex);
        if (!list_empty(&queue->queue_list)) {
                list_del_init(&queue->queue_list);
                disconnect = true;
        }
        mutex_unlock(&nvmet_rdma_queue_mutex);

        if (disconnect)
                __nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
                struct nvmet_rdma_queue *queue)
{
        WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

        mutex_lock(&nvmet_rdma_queue_mutex);
        if (!list_empty(&queue->queue_list))
                list_del_init(&queue->queue_list);
        mutex_unlock(&nvmet_rdma_queue_mutex);

        pr_err("failed to connect queue %d\n", queue->idx);
        schedule_work(&queue->release_work);
}

/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (where in this
 * case queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
                struct nvmet_rdma_queue *queue)
{
        struct nvmet_port *port;

        if (queue) {
                /*
                 * This is a queue cm_id. We have registered
                 * an ib_client to handle queue removal,
                 * so don't interfere and just return.
                 */
                return 0;
        }

        port = cm_id->context;

        /*
         * This is a listener cm_id. Make sure that
         * future remove_port won't invoke a double
         * cm_id destroy. Use atomic xchg to make sure
         * we don't compete with remove_port.
         */
        if (xchg(&port->priv, NULL) != cm_id)
                return 0;

        /*
         * We need to return 1 so that the core will destroy
         * its own ID. What a great API design..
         */
        return 1;
}

static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
{
        struct nvmet_rdma_queue *queue = NULL;
        int ret = 0;

        if (cm_id->qp)
                queue = cm_id->qp->qp_context;

        pr_debug("%s (%d): status %d id %p\n",
                rdma_event_msg(event->event), event->event,
                event->status, cm_id);

        switch (event->event) {
        case RDMA_CM_EVENT_CONNECT_REQUEST:
                ret = nvmet_rdma_queue_connect(cm_id, event);
                break;
        case RDMA_CM_EVENT_ESTABLISHED:
                nvmet_rdma_queue_established(queue);
                break;
        case RDMA_CM_EVENT_ADDR_CHANGE:
        case RDMA_CM_EVENT_DISCONNECTED:
        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
                nvmet_rdma_queue_disconnect(queue);
                break;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
                ret = nvmet_rdma_device_removal(cm_id, queue);
                break;
        case RDMA_CM_EVENT_REJECTED:
                pr_debug("Connection rejected: %s\n",
                         rdma_reject_msg(cm_id, event->status));
                /* FALLTHROUGH */
        case RDMA_CM_EVENT_UNREACHABLE:
        case RDMA_CM_EVENT_CONNECT_ERROR:
                nvmet_rdma_queue_connect_fail(cm_id, queue);
                break;
        default:
                pr_err("received unrecognized RDMA CM event %d\n",
                        event->event);
                break;
        }

        return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
        struct nvmet_rdma_queue *queue;

restart:
        mutex_lock(&nvmet_rdma_queue_mutex);
        list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
                if (queue->nvme_sq.ctrl == ctrl) {
                        list_del_init(&queue->queue_list);
                        mutex_unlock(&nvmet_rdma_queue_mutex);

                        __nvmet_rdma_queue_disconnect(queue);
                        goto restart;
                }
        }
        mutex_unlock(&nvmet_rdma_queue_mutex);
}

static int nvmet_rdma_add_port(struct nvmet_port *port)
{
        struct rdma_cm_id *cm_id;
        struct sockaddr_storage addr = { };
        __kernel_sa_family_t af;
        int ret;

        switch (port->disc_addr.adrfam) {
        case NVMF_ADDR_FAMILY_IP4:
                af = AF_INET;
                break;
        case NVMF_ADDR_FAMILY_IP6:
                af = AF_INET6;
                break;
        default:
                pr_err("address family %d not supported\n",
                                port->disc_addr.adrfam);
                return -EINVAL;
        }

        ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
                        port->disc_addr.trsvcid, &addr);
        if (ret) {
                pr_err("malformed ip/port passed: %s:%s\n",
                        port->disc_addr.traddr, port->disc_addr.trsvcid);
                return ret;
        }

        cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
                        RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(cm_id)) {
                pr_err("CM ID creation failed\n");
                return PTR_ERR(cm_id);
        }

        /*
         * Allow both IPv4 and IPv6 sockets to bind a single port
         * at the same time.
         */
        ret = rdma_set_afonly(cm_id, 1);
        if (ret) {
                pr_err("rdma_set_afonly failed (%d)\n", ret);
                goto out_destroy_id;
        }

        ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
        if (ret) {
                pr_err("binding CM ID to %pISpcs failed (%d)\n",
                        (struct sockaddr *)&addr, ret);
                goto out_destroy_id;
        }

        ret = rdma_listen(cm_id, 128);
        if (ret) {
                pr_err("listening to %pISpcs failed (%d)\n",
                        (struct sockaddr *)&addr, ret);
                goto out_destroy_id;
        }

        pr_info("enabling port %d (%pISpcs)\n",
                le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
        port->priv = cm_id;
        return 0;

out_destroy_id:
        rdma_destroy_id(cm_id);
        return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
        struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

        if (cm_id)
                rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
                struct nvmet_port *port, char *traddr)
{
        struct rdma_cm_id *cm_id = port->priv;

        if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
                struct nvmet_rdma_rsp *rsp =
                        container_of(req, struct nvmet_rdma_rsp, req);
                struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
                struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

                sprintf(traddr, "%pISc", addr);
        } else {
                memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
        }
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .owner			= THIS_MODULE,
        .type			= NVMF_TRTYPE_RDMA,
        .sqe_inline_size	= NVMET_RDMA_INLINE_DATA_SIZE,
        .msdbd			= 1,
        .has_keyed_sgls		= 1,
        .add_port		= nvmet_rdma_add_port,
        .remove_port		= nvmet_rdma_remove_port,
        .queue_response		= nvmet_rdma_queue_response,
        .delete_ctrl		= nvmet_rdma_delete_ctrl,
        .disc_traddr		= nvmet_rdma_disc_port_addr,
};

static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
        struct nvmet_rdma_queue *queue, *tmp;
        struct nvmet_rdma_device *ndev;
        bool found = false;

        mutex_lock(&device_list_mutex);
        list_for_each_entry(ndev, &device_list, entry) {
                if (ndev->device == ib_device) {
                        found = true;
                        break;
                }
        }
        mutex_unlock(&device_list_mutex);

        if (!found)
                return;

        /*
         * IB Device that is used by nvmet controllers is being removed,
         * delete all queues using this device.
         */
        mutex_lock(&nvmet_rdma_queue_mutex);
        list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
                                 queue_list) {
                if (queue->dev->device != ib_device)
                        continue;

                pr_info("Removing queue %d\n", queue->idx);
                list_del_init(&queue->queue_list);
                __nvmet_rdma_queue_disconnect(queue);
        }
        mutex_unlock(&nvmet_rdma_queue_mutex);

        flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
        .name	= "nvmet_rdma",
        .remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
        int ret;

        ret = ib_register_client(&nvmet_rdma_ib_client);
        if (ret)
                return ret;

        ret = nvmet_register_transport(&nvmet_rdma_ops);
        if (ret)
                goto err_ib_client;

        return 0;

err_ib_client:
        ib_unregister_client(&nvmet_rdma_ib_client);
        return ret;
}

static void __exit nvmet_rdma_exit(void)
{
        nvmet_unregister_transport(&nvmet_rdma_ops);
        ib_unregister_client(&nvmet_rdma_ib_client);
        WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
        ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */