user_sdma.c

  1. /*
  2. * Copyright(c) 2015 - 2018 Intel Corporation.
  3. *
  4. * This file is provided under a dual BSD/GPLv2 license. When using or
  5. * redistributing this file, you may do so under either license.
  6. *
  7. * GPL LICENSE SUMMARY
  8. *
  9. * This program is free software; you can redistribute it and/or modify
  10. * it under the terms of version 2 of the GNU General Public License as
  11. * published by the Free Software Foundation.
  12. *
  13. * This program is distributed in the hope that it will be useful, but
  14. * WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * General Public License for more details.
  17. *
  18. * BSD LICENSE
  19. *
  20. * Redistribution and use in source and binary forms, with or without
  21. * modification, are permitted provided that the following conditions
  22. * are met:
  23. *
  24. * - Redistributions of source code must retain the above copyright
  25. * notice, this list of conditions and the following disclaimer.
  26. * - Redistributions in binary form must reproduce the above copyright
  27. * notice, this list of conditions and the following disclaimer in
  28. * the documentation and/or other materials provided with the
  29. * distribution.
  30. * - Neither the name of Intel Corporation nor the names of its
  31. * contributors may be used to endorse or promote products derived
  32. * from this software without specific prior written permission.
  33. *
  34. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45. *
  46. */
  47. #include <linux/mm.h>
  48. #include <linux/types.h>
  49. #include <linux/device.h>
  50. #include <linux/dmapool.h>
  51. #include <linux/slab.h>
  52. #include <linux/list.h>
  53. #include <linux/highmem.h>
  54. #include <linux/io.h>
  55. #include <linux/uio.h>
  56. #include <linux/rbtree.h>
  57. #include <linux/spinlock.h>
  58. #include <linux/delay.h>
  59. #include <linux/kthread.h>
  60. #include <linux/mmu_context.h>
  61. #include <linux/module.h>
  62. #include <linux/vmalloc.h>
  63. #include <linux/string.h>
  64. #include "hfi.h"
  65. #include "sdma.h"
  66. #include "mmu_rb.h"
  67. #include "user_sdma.h"
  68. #include "verbs.h" /* for the headers */
  69. #include "common.h" /* for struct hfi1_tid_info */
  70. #include "trace.h"
  71. static uint hfi1_sdma_comp_ring_size = 128;
  72. module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  73. MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
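/*
 * Usage sketch (illustrative; the exact command below is an assumption, not
 * taken from this file): since the completion-ring size is a read-only
 * (S_IRUGO) module parameter, it would normally be set when the module is
 * loaded, e.g.
 *
 *	modprobe hfi1 sdma_comp_size=256
 *
 * after which the value should be visible under
 * /sys/module/hfi1/parameters/sdma_comp_size.
 */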
  74. static unsigned initial_pkt_count = 8;
  75. static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  76. static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  77. static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  78. static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
  79. static int pin_vector_pages(struct user_sdma_request *req,
  80. struct user_sdma_iovec *iovec);
  81. static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
  82. unsigned start, unsigned npages);
  83. static int check_header_template(struct user_sdma_request *req,
  84. struct hfi1_pkt_header *hdr, u32 lrhlen,
  85. u32 datalen);
  86. static int set_txreq_header(struct user_sdma_request *req,
  87. struct user_sdma_txreq *tx, u32 datalen);
  88. static int set_txreq_header_ahg(struct user_sdma_request *req,
  89. struct user_sdma_txreq *tx, u32 len);
  90. static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  91. struct hfi1_user_sdma_comp_q *cq,
  92. u16 idx, enum hfi1_sdma_comp_state state,
  93. int ret);
  94. static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  95. static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  96. static int defer_packet_queue(
  97. struct sdma_engine *sde,
  98. struct iowait_work *wait,
  99. struct sdma_txreq *txreq,
  100. uint seq,
  101. bool pkts_sent);
  102. static void activate_packet_queue(struct iowait *wait, int reason);
  103. static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
  104. unsigned long len);
  105. static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
  106. static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
  107. void *arg2, bool *stop);
  108. static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
  109. static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
  110. static struct mmu_rb_ops sdma_rb_ops = {
  111. .filter = sdma_rb_filter,
  112. .insert = sdma_rb_insert,
  113. .evict = sdma_rb_evict,
  114. .remove = sdma_rb_remove,
  115. .invalidate = sdma_rb_invalidate
  116. };
  117. static int defer_packet_queue(
  118. struct sdma_engine *sde,
  119. struct iowait_work *wait,
  120. struct sdma_txreq *txreq,
  121. uint seq,
  122. bool pkts_sent)
  123. {
  124. struct hfi1_user_sdma_pkt_q *pq =
  125. container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
  126. struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
  127. struct user_sdma_txreq *tx =
  128. container_of(txreq, struct user_sdma_txreq, txreq);
  129. if (sdma_progress(sde, seq, txreq)) {
  130. if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
  131. goto eagain;
  132. }
  133. /*
  134. * We are assuming that if the list is enqueued somewhere, it
  135. * is to the dmawait list since that is the only place where
  136. * it is supposed to be enqueued.
  137. */
  138. xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
  139. write_seqlock(&dev->iowait_lock);
  140. if (list_empty(&pq->busy.list))
  141. iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
  142. write_sequnlock(&dev->iowait_lock);
  143. return -EBUSY;
  144. eagain:
  145. return -EAGAIN;
  146. }
  147. static void activate_packet_queue(struct iowait *wait, int reason)
  148. {
  149. struct hfi1_user_sdma_pkt_q *pq =
  150. container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
  151. xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
  152. wake_up(&wait->wait_dma);
  153. }
  154. int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
  155. struct hfi1_filedata *fd)
  156. {
  157. int ret = -ENOMEM;
  158. char buf[64];
  159. struct hfi1_devdata *dd;
  160. struct hfi1_user_sdma_comp_q *cq;
  161. struct hfi1_user_sdma_pkt_q *pq;
  162. if (!uctxt || !fd)
  163. return -EBADF;
  164. if (!hfi1_sdma_comp_ring_size)
  165. return -EINVAL;
  166. dd = uctxt->dd;
  167. pq = kzalloc(sizeof(*pq), GFP_KERNEL);
  168. if (!pq)
  169. return -ENOMEM;
  170. pq->dd = dd;
  171. pq->ctxt = uctxt->ctxt;
  172. pq->subctxt = fd->subctxt;
  173. pq->n_max_reqs = hfi1_sdma_comp_ring_size;
  174. atomic_set(&pq->n_reqs, 0);
  175. init_waitqueue_head(&pq->wait);
  176. atomic_set(&pq->n_locked, 0);
  177. pq->mm = fd->mm;
  178. iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
  179. activate_packet_queue, NULL);
  180. pq->reqidx = 0;
  181. pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
  182. sizeof(*pq->reqs),
  183. GFP_KERNEL);
  184. if (!pq->reqs)
  185. goto pq_reqs_nomem;
  186. pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
  187. sizeof(*pq->req_in_use),
  188. GFP_KERNEL);
  189. if (!pq->req_in_use)
  190. goto pq_reqs_no_in_use;
  191. snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
  192. fd->subctxt);
  193. pq->txreq_cache = kmem_cache_create(buf,
  194. sizeof(struct user_sdma_txreq),
  195. L1_CACHE_BYTES,
  196. SLAB_HWCACHE_ALIGN,
  197. NULL);
  198. if (!pq->txreq_cache) {
  199. dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
  200. uctxt->ctxt);
  201. goto pq_txreq_nomem;
  202. }
  203. cq = kzalloc(sizeof(*cq), GFP_KERNEL);
  204. if (!cq)
  205. goto cq_nomem;
  206. cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
  207. * hfi1_sdma_comp_ring_size));
  208. if (!cq->comps)
  209. goto cq_comps_nomem;
  210. cq->nentries = hfi1_sdma_comp_ring_size;
  211. ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
  212. &pq->handler);
  213. if (ret) {
  214. dd_dev_err(dd, "Failed to register with MMU %d", ret);
  215. goto pq_mmu_fail;
  216. }
  217. fd->pq = pq;
  218. fd->cq = cq;
  219. return 0;
  220. pq_mmu_fail:
  221. vfree(cq->comps);
  222. cq_comps_nomem:
  223. kfree(cq);
  224. cq_nomem:
  225. kmem_cache_destroy(pq->txreq_cache);
  226. pq_txreq_nomem:
  227. kfree(pq->req_in_use);
  228. pq_reqs_no_in_use:
  229. kfree(pq->reqs);
  230. pq_reqs_nomem:
  231. kfree(pq);
  232. return ret;
  233. }
  234. int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
  235. struct hfi1_ctxtdata *uctxt)
  236. {
  237. struct hfi1_user_sdma_pkt_q *pq;
  238. trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
  239. pq = fd->pq;
  240. if (pq) {
  241. if (pq->handler)
  242. hfi1_mmu_rb_unregister(pq->handler);
  243. iowait_sdma_drain(&pq->busy);
  244. /* Wait until all requests have been freed. */
  245. wait_event_interruptible(
  246. pq->wait,
  247. !atomic_read(&pq->n_reqs));
  248. kfree(pq->reqs);
  249. kfree(pq->req_in_use);
  250. kmem_cache_destroy(pq->txreq_cache);
  251. kfree(pq);
  252. fd->pq = NULL;
  253. }
  254. if (fd->cq) {
  255. vfree(fd->cq->comps);
  256. kfree(fd->cq);
  257. fd->cq = NULL;
  258. }
  259. return 0;
  260. }
  261. static u8 dlid_to_selector(u16 dlid)
  262. {
  263. static u8 mapping[256];
  264. static int initialized;
  265. static u8 next;
  266. int hash;
  267. if (!initialized) {
  268. memset(mapping, 0xFF, 256);
  269. initialized = 1;
  270. }
  271. hash = ((dlid >> 8) ^ dlid) & 0xFF;
  272. if (mapping[hash] == 0xFF) {
  273. mapping[hash] = next;
  274. next = (next + 1) & 0x7F;
  275. }
  276. return mapping[hash];
  277. }
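/*
 * Illustrative sketch (not part of the driver): a stand-alone, user-space
 * model of dlid_to_selector() above. The demo_ names and main() are
 * hypothetical; only the 8-bit fold ((dlid >> 8) ^ dlid) & 0xFF and the
 * 7-bit round-robin hand-out mirror the function above.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t demo_mapping[256];
static uint8_t demo_next;

static uint8_t demo_dlid_to_selector(uint16_t dlid)
{
	int hash = ((dlid >> 8) ^ dlid) & 0xFF;

	if (demo_mapping[hash] == 0xFF) {
		demo_mapping[hash] = demo_next;		/* first DLID seen in this bucket */
		demo_next = (demo_next + 1) & 0x7F;	/* round-robin over 0..127 */
	}
	return demo_mapping[hash];
}

int main(void)
{
	memset(demo_mapping, 0xFF, sizeof(demo_mapping));
	printf("dlid 0x0010 -> %u\n", demo_dlid_to_selector(0x0010));
	printf("dlid 0x0011 -> %u\n", demo_dlid_to_selector(0x0011));
	printf("dlid 0x0010 -> %u\n", demo_dlid_to_selector(0x0010)); /* same bucket, same selector */
	return 0;
}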
  278. /**
  279. * hfi1_user_sdma_process_request() - Process and start a user sdma request
  280. * @fd: valid file descriptor
  281. * @iovec: array of io vectors to process
  282. * @dim: overall iovec array size
  283. * @count: number of io vector array entries processed
  284. */
  285. int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
  286. struct iovec *iovec, unsigned long dim,
  287. unsigned long *count)
  288. {
  289. int ret = 0, i;
  290. struct hfi1_ctxtdata *uctxt = fd->uctxt;
  291. struct hfi1_user_sdma_pkt_q *pq = fd->pq;
  292. struct hfi1_user_sdma_comp_q *cq = fd->cq;
  293. struct hfi1_devdata *dd = pq->dd;
  294. unsigned long idx = 0;
  295. u8 pcount = initial_pkt_count;
  296. struct sdma_req_info info;
  297. struct user_sdma_request *req;
  298. u8 opcode, sc, vl;
  299. u16 pkey;
  300. u32 slid;
  301. u16 dlid;
  302. u32 selector;
  303. if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
  304. hfi1_cdbg(
  305. SDMA,
  306. "[%u:%u:%u] First vector not big enough for header %lu/%lu",
  307. dd->unit, uctxt->ctxt, fd->subctxt,
  308. iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
  309. return -EINVAL;
  310. }
  311. ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
  312. if (ret) {
  313. hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
  314. dd->unit, uctxt->ctxt, fd->subctxt, ret);
  315. return -EFAULT;
  316. }
  317. trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
  318. (u16 *)&info);
  319. if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
  320. hfi1_cdbg(SDMA,
  321. "[%u:%u:%u:%u] Invalid comp index",
  322. dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
  323. return -EINVAL;
  324. }
  325. /*
  326. * Sanity check the header io vector count. Need at least 1 vector
  327. * (header) and cannot be larger than the actual io vector count.
  328. */
  329. if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
  330. hfi1_cdbg(SDMA,
  331. "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
  332. dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
  333. req_iovcnt(info.ctrl), dim);
  334. return -EINVAL;
  335. }
  336. if (!info.fragsize) {
  337. hfi1_cdbg(SDMA,
  338. "[%u:%u:%u:%u] Request does not specify fragsize",
  339. dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
  340. return -EINVAL;
  341. }
  342. /* Try to claim the request. */
  343. if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
  344. hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
  345. dd->unit, uctxt->ctxt, fd->subctxt,
  346. info.comp_idx);
  347. return -EBADSLT;
  348. }
  349. /*
  350. * All safety checks have been done and this request has been claimed.
  351. */
  352. trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
  353. info.comp_idx);
  354. req = pq->reqs + info.comp_idx;
  355. req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
  356. req->data_len = 0;
  357. req->pq = pq;
  358. req->cq = cq;
  359. req->ahg_idx = -1;
  360. req->iov_idx = 0;
  361. req->sent = 0;
  362. req->seqnum = 0;
  363. req->seqcomp = 0;
  364. req->seqsubmitted = 0;
  365. req->tids = NULL;
  366. req->has_error = 0;
  367. INIT_LIST_HEAD(&req->txps);
  368. memcpy(&req->info, &info, sizeof(info));
  369. /* The request is initialized, count it */
  370. atomic_inc(&pq->n_reqs);
  371. if (req_opcode(info.ctrl) == EXPECTED) {
  372. /* expected requests must have a TID info vector and at least one data vector */
  373. if (req->data_iovs < 2) {
  374. SDMA_DBG(req,
  375. "Not enough vectors for expected request");
  376. ret = -EINVAL;
  377. goto free_req;
  378. }
  379. req->data_iovs--;
  380. }
  381. if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
  382. SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
  383. MAX_VECTORS_PER_REQ);
  384. ret = -EINVAL;
  385. goto free_req;
  386. }
  387. /* Copy the header from the user buffer */
  388. ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
  389. sizeof(req->hdr));
  390. if (ret) {
  391. SDMA_DBG(req, "Failed to copy header template (%d)", ret);
  392. ret = -EFAULT;
  393. goto free_req;
  394. }
  395. /* If Static rate control is not enabled, sanitize the header. */
  396. if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
  397. req->hdr.pbc[2] = 0;
  398. /* Validate the opcode. Do not trust packets from user space blindly. */
  399. opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
  400. if ((opcode & USER_OPCODE_CHECK_MASK) !=
  401. USER_OPCODE_CHECK_VAL) {
  402. SDMA_DBG(req, "Invalid opcode (%d)", opcode);
  403. ret = -EINVAL;
  404. goto free_req;
  405. }
  406. /*
  407. * Validate the vl. Do not trust packets from user space blindly.
  408. * VL comes from PBC, SC comes from LRH, and the VL needs to
  409. * match the SC lookup.
  410. */
  411. vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
  412. sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
  413. (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
  414. if (vl >= dd->pport->vls_operational ||
  415. vl != sc_to_vlt(dd, sc)) {
  416. SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
  417. ret = -EINVAL;
  418. goto free_req;
  419. }
  420. /* Checking P_KEY for requests from user-space */
  421. pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
  422. slid = be16_to_cpu(req->hdr.lrh[3]);
  423. if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
  424. ret = -EINVAL;
  425. goto free_req;
  426. }
  427. /*
  428. * We should also check BTH.lnh. If it says the next header is a GRH, then
  429. * the RXE parsing will be off and will land in the middle of the KDETH
  430. * or miss it entirely.
  431. */
  432. if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
  433. SDMA_DBG(req, "User tried to pass in a GRH");
  434. ret = -EINVAL;
  435. goto free_req;
  436. }
  437. req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
  438. /*
  439. * Calculate the initial TID offset based on the values of
  440. * KDETH.OFFSET and KDETH.OM that are passed in.
  441. */
  442. req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
  443. (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
  444. KDETH_OM_LARGE : KDETH_OM_SMALL);
  445. trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
  446. info.comp_idx, req->tidoffset);
  447. idx++;
  448. /* Save all the IO vector structures */
  449. for (i = 0; i < req->data_iovs; i++) {
  450. req->iovs[i].offset = 0;
  451. INIT_LIST_HEAD(&req->iovs[i].list);
  452. memcpy(&req->iovs[i].iov,
  453. iovec + idx++,
  454. sizeof(req->iovs[i].iov));
  455. ret = pin_vector_pages(req, &req->iovs[i]);
  456. if (ret) {
  457. req->data_iovs = i;
  458. goto free_req;
  459. }
  460. req->data_len += req->iovs[i].iov.iov_len;
  461. }
  462. trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
  463. info.comp_idx, req->data_len);
  464. if (pcount > req->info.npkts)
  465. pcount = req->info.npkts;
  466. /*
  467. * Copy any TID info
  468. * User space will provide the TID info only when the
  469. * request type is EXPECTED. This is true even if there is
  470. * only one packet in the request and the header is already
  471. * set up. The reason for the singular TID case is that the
  472. * driver needs to perform safety checks.
  473. */
  474. if (req_opcode(req->info.ctrl) == EXPECTED) {
  475. u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
  476. u32 *tmp;
  477. if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
  478. ret = -EINVAL;
  479. goto free_req;
  480. }
  481. /*
  482. * We have to copy all of the tids because they may vary
  483. * in size and, therefore, the TID count might not be
  484. * equal to the pkt count. However, there is no way to
  485. * tell at this point.
  486. */
  487. tmp = memdup_user(iovec[idx].iov_base,
  488. ntids * sizeof(*req->tids));
  489. if (IS_ERR(tmp)) {
  490. ret = PTR_ERR(tmp);
  491. SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
  492. ntids, ret);
  493. goto free_req;
  494. }
  495. req->tids = tmp;
  496. req->n_tids = ntids;
  497. req->tididx = 0;
  498. idx++;
  499. }
  500. dlid = be16_to_cpu(req->hdr.lrh[1]);
  501. selector = dlid_to_selector(dlid);
  502. selector += uctxt->ctxt + fd->subctxt;
  503. req->sde = sdma_select_user_engine(dd, selector, vl);
  504. if (!req->sde || !sdma_running(req->sde)) {
  505. ret = -ECOMM;
  506. goto free_req;
  507. }
  508. /* We don't need an AHG entry if the request contains only one packet */
  509. if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
  510. req->ahg_idx = sdma_ahg_alloc(req->sde);
  511. set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
  512. pq->state = SDMA_PKT_Q_ACTIVE;
  513. /* Send the first N packets in the request to buy us some time */
  514. ret = user_sdma_send_pkts(req, pcount);
  515. if (unlikely(ret < 0 && ret != -EBUSY))
  516. goto free_req;
  517. /*
  518. * This is a somewhat blocking send implementation.
  519. * The driver will block the caller until all packets of the
  520. * request have been submitted to the SDMA engine. However, it
  521. * will not wait for send completions.
  522. */
  523. while (req->seqsubmitted != req->info.npkts) {
  524. ret = user_sdma_send_pkts(req, pcount);
  525. if (ret < 0) {
  526. if (ret != -EBUSY)
  527. goto free_req;
  528. wait_event_interruptible_timeout(
  529. pq->busy.wait_dma,
  530. (pq->state == SDMA_PKT_Q_ACTIVE),
  531. msecs_to_jiffies(
  532. SDMA_IOWAIT_TIMEOUT));
  533. }
  534. }
  535. *count += idx;
  536. return 0;
  537. free_req:
  538. /*
  539. * If seqsubmitted == npkts, the completion routine controls the
  540. * final state. If seqsubmitted < npkts, wait for any
  541. * outstanding packets to finish before cleaning up.
  542. */
  543. if (req->seqsubmitted < req->info.npkts) {
  544. if (req->seqsubmitted)
  545. wait_event(pq->busy.wait_dma,
  546. (req->seqcomp == req->seqsubmitted - 1));
  547. user_sdma_free_request(req, true);
  548. pq_update(pq);
  549. set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
  550. }
  551. return ret;
  552. }
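/*
 * Layout summary (restated from the checks in the function above; no new
 * ABI details are introduced):
 *
 *	iovec[0]	struct sdma_req_info immediately followed by the
 *			packet header template that is copied into req->hdr
 *	iovec[1..n]	payload vectors, where n is req_iovcnt(info.ctrl) - 1
 *			(minus one more for EXPECTED requests)
 *	iovec[last]	for EXPECTED requests only: an array of TID entries,
 *			each sizeof(*req->tids) bytes
 */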
  553. static inline u32 compute_data_length(struct user_sdma_request *req,
  554. struct user_sdma_txreq *tx)
  555. {
  556. /*
  557. * Determine the proper size of the packet data.
  558. * The size of the data of the first packet is in the header
  559. * template. However, it includes the header and ICRC, which need
  560. * to be subtracted.
  561. * The minimum representable packet data length in a header is 4 bytes,
  562. * therefore, when the data length request is less than 4 bytes, there's
  563. * only one packet, and the packet data length is equal to that of the
  564. * request data length.
  565. * The size of the remaining packets is the minimum of the frag
  566. * size (MTU) or remaining data in the request.
  567. */
  568. u32 len;
  569. if (!req->seqnum) {
  570. if (req->data_len < sizeof(u32))
  571. len = req->data_len;
  572. else
  573. len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
  574. (sizeof(tx->hdr) - 4));
  575. } else if (req_opcode(req->info.ctrl) == EXPECTED) {
  576. u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
  577. PAGE_SIZE;
  578. /*
  579. * Get the data length based on the remaining space in the
  580. * TID pair.
  581. */
  582. len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
  583. /* If we've filled up the TID pair, move to the next one. */
  584. if (unlikely(!len) && ++req->tididx < req->n_tids &&
  585. req->tids[req->tididx]) {
  586. tidlen = EXP_TID_GET(req->tids[req->tididx],
  587. LEN) * PAGE_SIZE;
  588. req->tidoffset = 0;
  589. len = min_t(u32, tidlen, req->info.fragsize);
  590. }
  591. /*
  592. * Since the TID pairs map entire pages, make sure that we
  593. * are not going to try to send more data than we have
  594. * remaining.
  595. */
  596. len = min(len, req->data_len - req->sent);
  597. } else {
  598. len = min(req->data_len - req->sent, (u32)req->info.fragsize);
  599. }
  600. trace_hfi1_sdma_user_compute_length(req->pq->dd,
  601. req->pq->ctxt,
  602. req->pq->subctxt,
  603. req->info.comp_idx,
  604. len);
  605. return len;
  606. }
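/*
 * Worked example (illustrative numbers only): for an EXPECTED request with
 * fragsize 4096 and a TID pair spanning 8192 bytes, a packet built at
 * tidoffset 4096 gets
 *
 *	len = min(8192 - 4096, 4096) = 4096
 *
 * which is then further clamped to the bytes still unsent in the request
 * (req->data_len - req->sent), exactly as computed above.
 */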
  607. static inline u32 pad_len(u32 len)
  608. {
  609. if (len & (sizeof(u32) - 1))
  610. len += sizeof(u32) - (len & (sizeof(u32) - 1));
  611. return len;
  612. }
  613. static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
  614. {
  615. /* (Size of complete header - size of PBC) + 4B ICRC + data length */
  616. return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
  617. }
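/*
 * Illustrative sketch (not part of the driver): the padding and LRH-length
 * arithmetic above as a stand-alone program. DEMO_HDR_BYTES and
 * DEMO_PBC_BYTES are hypothetical stand-ins for sizeof(struct
 * hfi1_pkt_header) and sizeof(hdr.pbc); only the formulas themselves are
 * taken from pad_len() and get_lrh_len() above.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_HDR_BYTES	64U	/* assumed header template size */
#define DEMO_PBC_BYTES	8U	/* assumed PBC size */

static uint32_t demo_pad_len(uint32_t len)
{
	if (len & 3)
		len += 4 - (len & 3);	/* round up to a 4-byte multiple */
	return len;
}

static uint32_t demo_get_lrh_len(uint32_t padded_payload)
{
	/* (complete header - PBC) + 4-byte ICRC + padded payload */
	return (DEMO_HDR_BYTES - DEMO_PBC_BYTES) + 4 + padded_payload;
}

int main(void)
{
	uint32_t payload = 7;

	printf("payload %u -> padded %u -> LRH bytes %u\n", payload,
	       demo_pad_len(payload),
	       demo_get_lrh_len(demo_pad_len(payload)));
	return 0;
}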
  618. static int user_sdma_txadd_ahg(struct user_sdma_request *req,
  619. struct user_sdma_txreq *tx,
  620. u32 datalen)
  621. {
  622. int ret;
  623. u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
  624. u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
  625. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  626. /*
  627. * Copy the request header into the tx header
  628. * because the HW needs a cacheline-aligned
  629. * address.
  630. * This copy could be optimized out if the hdr
  631. * member of user_sdma_request were also
  632. * cacheline aligned.
  633. */
  634. memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
  635. if (PBC2LRH(pbclen) != lrhlen) {
  636. pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
  637. tx->hdr.pbc[0] = cpu_to_le16(pbclen);
  638. }
  639. ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
  640. if (ret)
  641. return ret;
  642. ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
  643. sizeof(tx->hdr) + datalen, req->ahg_idx,
  644. 0, NULL, 0, user_sdma_txreq_cb);
  645. if (ret)
  646. return ret;
  647. ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
  648. if (ret)
  649. sdma_txclean(pq->dd, &tx->txreq);
  650. return ret;
  651. }
  652. static int user_sdma_txadd(struct user_sdma_request *req,
  653. struct user_sdma_txreq *tx,
  654. struct user_sdma_iovec *iovec, u32 datalen,
  655. u32 *queued_ptr, u32 *data_sent_ptr,
  656. u64 *iov_offset_ptr)
  657. {
  658. int ret;
  659. unsigned int pageidx, len;
  660. unsigned long base, offset;
  661. u64 iov_offset = *iov_offset_ptr;
  662. u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
  663. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  664. base = (unsigned long)iovec->iov.iov_base;
  665. offset = offset_in_page(base + iovec->offset + iov_offset);
  666. pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
  667. PAGE_SHIFT);
  668. len = offset + req->info.fragsize > PAGE_SIZE ?
  669. PAGE_SIZE - offset : req->info.fragsize;
  670. len = min((datalen - queued), len);
  671. ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
  672. offset, len);
  673. if (ret) {
  674. SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
  675. return ret;
  676. }
  677. iov_offset += len;
  678. queued += len;
  679. data_sent += len;
  680. if (unlikely(queued < datalen && pageidx == iovec->npages &&
  681. req->iov_idx < req->data_iovs - 1)) {
  682. iovec->offset += iov_offset;
  683. iovec = &req->iovs[++req->iov_idx];
  684. iov_offset = 0;
  685. }
  686. *queued_ptr = queued;
  687. *data_sent_ptr = data_sent;
  688. *iov_offset_ptr = iov_offset;
  689. return ret;
  690. }
  691. static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
  692. {
  693. int ret = 0;
  694. u16 count;
  695. unsigned npkts = 0;
  696. struct user_sdma_txreq *tx = NULL;
  697. struct hfi1_user_sdma_pkt_q *pq = NULL;
  698. struct user_sdma_iovec *iovec = NULL;
  699. if (!req->pq)
  700. return -EINVAL;
  701. pq = req->pq;
  702. /* If tx completion has reported an error, we are done. */
  703. if (READ_ONCE(req->has_error))
  704. return -EFAULT;
  705. /*
  706. * Check if we might have sent the entire request already
  707. */
  708. if (unlikely(req->seqnum == req->info.npkts)) {
  709. if (!list_empty(&req->txps))
  710. goto dosend;
  711. return ret;
  712. }
  713. if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
  714. maxpkts = req->info.npkts - req->seqnum;
  715. while (npkts < maxpkts) {
  716. u32 datalen = 0, queued = 0, data_sent = 0;
  717. u64 iov_offset = 0;
  718. /*
  719. * Check whether any of the completions have come back
  720. * with errors. If so, we are not going to process any
  721. * more packets from this request.
  722. */
  723. if (READ_ONCE(req->has_error))
  724. return -EFAULT;
  725. tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
  726. if (!tx)
  727. return -ENOMEM;
  728. tx->flags = 0;
  729. tx->req = req;
  730. tx->busycount = 0;
  731. INIT_LIST_HEAD(&tx->list);
  732. /*
  733. * For the last packet set the ACK request
  734. * and disable header suppression.
  735. */
  736. if (req->seqnum == req->info.npkts - 1)
  737. tx->flags |= (TXREQ_FLAGS_REQ_ACK |
  738. TXREQ_FLAGS_REQ_DISABLE_SH);
  739. /*
  740. * Calculate the payload size - this is the minimum of the fragment
  741. * (MTU) size and the remaining bytes in the request, but only
  742. * if we have payload data.
  743. */
  744. if (req->data_len) {
  745. iovec = &req->iovs[req->iov_idx];
  746. if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
  747. if (++req->iov_idx == req->data_iovs) {
  748. ret = -EFAULT;
  749. goto free_tx;
  750. }
  751. iovec = &req->iovs[req->iov_idx];
  752. WARN_ON(iovec->offset);
  753. }
  754. datalen = compute_data_length(req, tx);
  755. /*
  756. * Disable header suppression when the payload is <= 8 DWs.
  757. * If there is an uncorrectable error in the receive
  758. * data FIFO when the received payload size is less than
  759. * or equal to 8 DWs, then the RxDmaDataFifoRdUncErr is
  760. * not reported. RHF.EccErr is set instead if the header
  761. * is not suppressed.
  762. */
  763. if (!datalen) {
  764. SDMA_DBG(req,
  765. "Request has data but pkt len is 0");
  766. ret = -EFAULT;
  767. goto free_tx;
  768. } else if (datalen <= 32) {
  769. tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
  770. }
  771. }
  772. if (req->ahg_idx >= 0) {
  773. if (!req->seqnum) {
  774. ret = user_sdma_txadd_ahg(req, tx, datalen);
  775. if (ret)
  776. goto free_tx;
  777. } else {
  778. int changes;
  779. changes = set_txreq_header_ahg(req, tx,
  780. datalen);
  781. if (changes < 0) {
  782. ret = changes;
  783. goto free_tx;
  784. }
  785. }
  786. } else {
  787. ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
  788. datalen, user_sdma_txreq_cb);
  789. if (ret)
  790. goto free_tx;
  791. /*
  792. * Modify the header for this packet. This only needs
  793. * to be done if we are not going to use AHG. Otherwise,
  794. * the HW will do it based on the changes we gave it
  795. * during sdma_txinit_ahg().
  796. */
  797. ret = set_txreq_header(req, tx, datalen);
  798. if (ret)
  799. goto free_txreq;
  800. }
  801. /*
  802. * If the request contains any data vectors, add up to
  803. * fragsize bytes to the descriptor.
  804. */
  805. while (queued < datalen &&
  806. (req->sent + data_sent) < req->data_len) {
  807. ret = user_sdma_txadd(req, tx, iovec, datalen,
  808. &queued, &data_sent, &iov_offset);
  809. if (ret)
  810. goto free_txreq;
  811. }
  812. /*
  813. * The txreq was submitted successfully so we can update
  814. * the counters.
  815. */
  816. req->koffset += datalen;
  817. if (req_opcode(req->info.ctrl) == EXPECTED)
  818. req->tidoffset += datalen;
  819. req->sent += data_sent;
  820. if (req->data_len)
  821. iovec->offset += iov_offset;
  822. list_add_tail(&tx->txreq.list, &req->txps);
  823. /*
  824. * It is important to increment this here as it is used to
  825. * generate the BTH.PSN and, therefore, can't be bulk-updated
  826. * outside of the loop.
  827. */
  828. tx->seqnum = req->seqnum++;
  829. npkts++;
  830. }
  831. dosend:
  832. ret = sdma_send_txlist(req->sde,
  833. iowait_get_ib_work(&pq->busy),
  834. &req->txps, &count);
  835. req->seqsubmitted += count;
  836. if (req->seqsubmitted == req->info.npkts) {
  837. /*
  838. * The txreq has already been submitted to the HW queue
  839. * so we can free the AHG entry now. Corruption will not
  840. * happen due to the sequential manner in which
  841. * descriptors are processed.
  842. */
  843. if (req->ahg_idx >= 0)
  844. sdma_ahg_free(req->sde, req->ahg_idx);
  845. }
  846. return ret;
  847. free_txreq:
  848. sdma_txclean(pq->dd, &tx->txreq);
  849. free_tx:
  850. kmem_cache_free(pq->txreq_cache, tx);
  851. return ret;
  852. }
  853. static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
  854. {
  855. struct evict_data evict_data;
  856. evict_data.cleared = 0;
  857. evict_data.target = npages;
  858. hfi1_mmu_rb_evict(pq->handler, &evict_data);
  859. return evict_data.cleared;
  860. }
  861. static int pin_sdma_pages(struct user_sdma_request *req,
  862. struct user_sdma_iovec *iovec,
  863. struct sdma_mmu_node *node,
  864. int npages)
  865. {
  866. int pinned, cleared;
  867. struct page **pages;
  868. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  869. pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
  870. if (!pages)
  871. return -ENOMEM;
  872. memcpy(pages, node->pages, node->npages * sizeof(*pages));
  873. npages -= node->npages;
  874. retry:
  875. if (!hfi1_can_pin_pages(pq->dd, pq->mm,
  876. atomic_read(&pq->n_locked), npages)) {
  877. cleared = sdma_cache_evict(pq, npages);
  878. if (cleared >= npages)
  879. goto retry;
  880. }
  881. pinned = hfi1_acquire_user_pages(pq->mm,
  882. ((unsigned long)iovec->iov.iov_base +
  883. (node->npages * PAGE_SIZE)), npages, 0,
  884. pages + node->npages);
  885. if (pinned < 0) {
  886. kfree(pages);
  887. return pinned;
  888. }
  889. if (pinned != npages) {
  890. unpin_vector_pages(pq->mm, pages, node->npages, pinned);
  891. return -EFAULT;
  892. }
  893. kfree(node->pages);
  894. node->rb.len = iovec->iov.iov_len;
  895. node->pages = pages;
  896. atomic_add(pinned, &pq->n_locked);
  897. return pinned;
  898. }
  899. static void unpin_sdma_pages(struct sdma_mmu_node *node)
  900. {
  901. if (node->npages) {
  902. unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
  903. atomic_sub(node->npages, &node->pq->n_locked);
  904. }
  905. }
  906. static int pin_vector_pages(struct user_sdma_request *req,
  907. struct user_sdma_iovec *iovec)
  908. {
  909. int ret = 0, pinned, npages;
  910. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  911. struct sdma_mmu_node *node = NULL;
  912. struct mmu_rb_node *rb_node;
  913. struct iovec *iov;
  914. bool extracted;
  915. extracted =
  916. hfi1_mmu_rb_remove_unless_exact(pq->handler,
  917. (unsigned long)
  918. iovec->iov.iov_base,
  919. iovec->iov.iov_len, &rb_node);
  920. if (rb_node) {
  921. node = container_of(rb_node, struct sdma_mmu_node, rb);
  922. if (!extracted) {
  923. atomic_inc(&node->refcount);
  924. iovec->pages = node->pages;
  925. iovec->npages = node->npages;
  926. iovec->node = node;
  927. return 0;
  928. }
  929. }
  930. if (!node) {
  931. node = kzalloc(sizeof(*node), GFP_KERNEL);
  932. if (!node)
  933. return -ENOMEM;
  934. node->rb.addr = (unsigned long)iovec->iov.iov_base;
  935. node->pq = pq;
  936. atomic_set(&node->refcount, 0);
  937. }
  938. iov = &iovec->iov;
  939. npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
  940. if (node->npages < npages) {
  941. pinned = pin_sdma_pages(req, iovec, node, npages);
  942. if (pinned < 0) {
  943. ret = pinned;
  944. goto bail;
  945. }
  946. node->npages += pinned;
  947. npages = node->npages;
  948. }
  949. iovec->pages = node->pages;
  950. iovec->npages = npages;
  951. iovec->node = node;
  952. ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
  953. if (ret) {
  954. iovec->node = NULL;
  955. goto bail;
  956. }
  957. return 0;
  958. bail:
  959. unpin_sdma_pages(node);
  960. kfree(node);
  961. return ret;
  962. }
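/*
 * Flow summary (restated from the code above): if a cached sdma_mmu_node
 * still covers the vector, it is reused with only a refcount bump; otherwise
 * the node is (re)pinned for the larger range, with sdma_cache_evict()
 * reclaiming pages from idle nodes whenever the pinned-page budget would be
 * exceeded, and the node is then (re)inserted into the MMU rb tree.
 */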
  963. static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
  964. unsigned start, unsigned npages)
  965. {
  966. hfi1_release_user_pages(mm, pages + start, npages, false);
  967. kfree(pages);
  968. }
  969. static int check_header_template(struct user_sdma_request *req,
  970. struct hfi1_pkt_header *hdr, u32 lrhlen,
  971. u32 datalen)
  972. {
  973. /*
  974. * Perform safety checks for any type of packet:
  975. * - transfer size is a multiple of 64 bytes
  976. * - packet length is a multiple of 4 bytes
  977. * - packet length is not larger than the MTU size
  978. *
  979. * These checks are only done for the first packet of the
  980. * transfer since the header is "given" to us by user space.
  981. * For the remainder of the packets we compute the values.
  982. */
  983. if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
  984. lrhlen > get_lrh_len(*hdr, req->info.fragsize))
  985. return -EINVAL;
  986. if (req_opcode(req->info.ctrl) == EXPECTED) {
  987. /*
  988. * The header is checked only on the first packet. Furthermore,
  989. * we ensure that at least one TID entry is copied when the
  990. * request is submitted. Therefore, we don't have to verify that
  991. * tididx points to something sane.
  992. */
  993. u32 tidval = req->tids[req->tididx],
  994. tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
  995. tididx = EXP_TID_GET(tidval, IDX),
  996. tidctrl = EXP_TID_GET(tidval, CTRL),
  997. tidoff;
  998. __le32 kval = hdr->kdeth.ver_tid_offset;
  999. tidoff = KDETH_GET(kval, OFFSET) *
  1000. (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
  1001. KDETH_OM_LARGE : KDETH_OM_SMALL);
  1002. /*
  1003. * Expected receive packets have the following
  1004. * additional checks:
  1005. * - offset is not larger than the TID size
  1006. * - TIDCtrl values match between header and TID array
  1007. * - TID indexes match between header and TID array
  1008. */
  1009. if ((tidoff + datalen > tidlen) ||
  1010. KDETH_GET(kval, TIDCTRL) != tidctrl ||
  1011. KDETH_GET(kval, TID) != tididx)
  1012. return -EINVAL;
  1013. }
  1014. return 0;
  1015. }
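/*
 * Worked example (illustrative numbers only): for a TID entry describing an
 * 8192-byte region (LEN * PAGE_SIZE), a first packet with datalen 4096 and a
 * KDETH offset decoding to tidoff 4096 passes the size check above
 * (4096 + 4096 <= 8192), provided KDETH.TIDCTRL and KDETH.TID also match the
 * CTRL and IDX fields of that TID entry.
 */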
  1016. /*
  1017. * Correctly set the BTH.PSN field based on type of
  1018. * transfer - eager packets can just increment the PSN but
  1019. * expected packets encode generation and sequence in the
  1020. * BTH.PSN field so just incrementing will result in errors.
  1021. */
  1022. static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
  1023. {
  1024. u32 val = be32_to_cpu(bthpsn),
  1025. mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
  1026. 0xffffffull),
  1027. psn = val & mask;
  1028. if (expct)
  1029. psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
  1030. else
  1031. psn = psn + frags;
  1032. return psn & mask;
  1033. }
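/*
 * Illustrative sketch (not part of the driver): the two PSN update rules
 * above in stand-alone form. DEMO_SEQ_MASK is a hypothetical stand-in for
 * BTH_SEQ_MASK (11 bits assumed here purely for illustration); the 24-bit
 * PSN mask corresponds to the non-extended-PSN case in the code above.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PSN_MASK	0xffffffu	/* 24-bit PSN (non-extended case) */
#define DEMO_SEQ_MASK	0x7ffu		/* assumed sequence-bits mask */

static uint32_t demo_next_psn(uint32_t psn, int expected, uint32_t frags)
{
	if (expected)
		/* advance only the sequence bits, preserve the generation bits */
		psn = (psn & ~DEMO_SEQ_MASK) | ((psn + frags) & DEMO_SEQ_MASK);
	else
		psn += frags;		/* eager: plain increment */
	return psn & DEMO_PSN_MASK;
}

int main(void)
{
	printf("expected: 0x%06x\n", demo_next_psn(0x1237fe, 1, 4));
	printf("eager:    0x%06x\n", demo_next_psn(0x1237fe, 0, 4));
	return 0;
}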
  1034. static int set_txreq_header(struct user_sdma_request *req,
  1035. struct user_sdma_txreq *tx, u32 datalen)
  1036. {
  1037. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  1038. struct hfi1_pkt_header *hdr = &tx->hdr;
  1039. u8 omfactor; /* KDETH.OM */
  1040. u16 pbclen;
  1041. int ret;
  1042. u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
  1043. /* Copy the header template to the request before modification */
  1044. memcpy(hdr, &req->hdr, sizeof(*hdr));
  1045. /*
  1046. * Check if the PBC and LRH length are mismatched. If so
  1047. * adjust both in the header.
  1048. */
  1049. pbclen = le16_to_cpu(hdr->pbc[0]);
  1050. if (PBC2LRH(pbclen) != lrhlen) {
  1051. pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
  1052. hdr->pbc[0] = cpu_to_le16(pbclen);
  1053. hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
  1054. /*
  1055. * Third packet
  1056. * This is the first packet in the sequence that has
  1057. * a "static" size that can be used for the rest of
  1058. * the packets (besides the last one).
  1059. */
  1060. if (unlikely(req->seqnum == 2)) {
  1061. /*
  1062. * From this point on the lengths in both the
  1063. * PBC and LRH are the same until the last
  1064. * packet.
  1065. * Adjust the template so we don't have to update
  1066. * every packet
  1067. */
  1068. req->hdr.pbc[0] = hdr->pbc[0];
  1069. req->hdr.lrh[2] = hdr->lrh[2];
  1070. }
  1071. }
  1072. /*
  1073. * We only have to modify the header if this is not the
  1074. * first packet in the request. Otherwise, we use the
  1075. * header given to us.
  1076. */
  1077. if (unlikely(!req->seqnum)) {
  1078. ret = check_header_template(req, hdr, lrhlen, datalen);
  1079. if (ret)
  1080. return ret;
  1081. goto done;
  1082. }
  1083. hdr->bth[2] = cpu_to_be32(
  1084. set_pkt_bth_psn(hdr->bth[2],
  1085. (req_opcode(req->info.ctrl) == EXPECTED),
  1086. req->seqnum));
  1087. /* Set ACK request on last packet */
  1088. if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
  1089. hdr->bth[2] |= cpu_to_be32(1UL << 31);
  1090. /* Set the new offset */
  1091. hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
  1092. /* Expected packets have to fill in the new TID information */
  1093. if (req_opcode(req->info.ctrl) == EXPECTED) {
  1094. tidval = req->tids[req->tididx];
  1095. /*
  1096. * If the offset puts us at the end of the current TID,
  1097. * advance everything.
  1098. */
  1099. if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
  1100. PAGE_SIZE)) {
  1101. req->tidoffset = 0;
  1102. /*
  1103. * Since we don't copy all the TIDs all at once,
  1104. * we have to check again.
  1105. */
  1106. if (++req->tididx > req->n_tids - 1 ||
  1107. !req->tids[req->tididx]) {
  1108. return -EINVAL;
  1109. }
  1110. tidval = req->tids[req->tididx];
  1111. }
  1112. omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
  1113. KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
  1114. KDETH_OM_SMALL_SHIFT;
  1115. /* Set KDETH.TIDCtrl based on value for this TID. */
  1116. KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
  1117. EXP_TID_GET(tidval, CTRL));
  1118. /* Set KDETH.TID based on value for this TID */
  1119. KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
  1120. EXP_TID_GET(tidval, IDX));
  1121. /* Clear KDETH.SH when DISABLE_SH flag is set */
  1122. if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
  1123. KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
  1124. /*
  1125. * Set the KDETH.OFFSET and KDETH.OM based on size of
  1126. * transfer.
  1127. */
  1128. trace_hfi1_sdma_user_tid_info(
  1129. pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
  1130. req->tidoffset, req->tidoffset >> omfactor,
  1131. omfactor != KDETH_OM_SMALL_SHIFT);
  1132. KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
  1133. req->tidoffset >> omfactor);
  1134. KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
  1135. omfactor != KDETH_OM_SMALL_SHIFT);
  1136. }
  1137. done:
  1138. trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
  1139. req->info.comp_idx, hdr, tidval);
  1140. return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
  1141. }
  1142. static int set_txreq_header_ahg(struct user_sdma_request *req,
  1143. struct user_sdma_txreq *tx, u32 datalen)
  1144. {
  1145. u32 ahg[AHG_KDETH_ARRAY_SIZE];
  1146. int idx = 0;
  1147. u8 omfactor; /* KDETH.OM */
  1148. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  1149. struct hfi1_pkt_header *hdr = &req->hdr;
  1150. u16 pbclen = le16_to_cpu(hdr->pbc[0]);
  1151. u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
  1152. size_t array_size = ARRAY_SIZE(ahg);
  1153. if (PBC2LRH(pbclen) != lrhlen) {
  1154. /* PBC.PbcLengthDWs */
  1155. idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
  1156. (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
  1157. if (idx < 0)
  1158. return idx;
  1159. /* LRH.PktLen (we need the full 16 bits due to byte swap) */
  1160. idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
  1161. (__force u16)cpu_to_be16(lrhlen >> 2));
  1162. if (idx < 0)
  1163. return idx;
  1164. }
  1165. /*
  1166. * Do the common updates
  1167. */
  1168. /* BTH.PSN and BTH.A */
  1169. val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
  1170. (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
  1171. if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
  1172. val32 |= 1UL << 31;
  1173. idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
  1174. (__force u16)cpu_to_be16(val32 >> 16));
  1175. if (idx < 0)
  1176. return idx;
  1177. idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
  1178. (__force u16)cpu_to_be16(val32 & 0xffff));
  1179. if (idx < 0)
  1180. return idx;
  1181. /* KDETH.Offset */
  1182. idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
  1183. (__force u16)cpu_to_le16(req->koffset & 0xffff));
  1184. if (idx < 0)
  1185. return idx;
  1186. idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
  1187. (__force u16)cpu_to_le16(req->koffset >> 16));
  1188. if (idx < 0)
  1189. return idx;
  1190. if (req_opcode(req->info.ctrl) == EXPECTED) {
  1191. __le16 val;
  1192. tidval = req->tids[req->tididx];
  1193. /*
  1194. * If the offset puts us at the end of the current TID,
  1195. * advance everything.
  1196. */
  1197. if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
  1198. PAGE_SIZE)) {
  1199. req->tidoffset = 0;
  1200. /*
  1201. * Since we don't copy all the TIDs all at once,
  1202. * we have to check again.
  1203. */
  1204. if (++req->tididx > req->n_tids - 1 ||
  1205. !req->tids[req->tididx])
  1206. return -EINVAL;
  1207. tidval = req->tids[req->tididx];
  1208. }
  1209. omfactor = ((EXP_TID_GET(tidval, LEN) *
  1210. PAGE_SIZE) >=
  1211. KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
  1212. KDETH_OM_SMALL_SHIFT;
  1213. /* KDETH.OM and KDETH.OFFSET (TID) */
  1214. idx = ahg_header_set(
  1215. ahg, idx, array_size, 7, 0, 16,
  1216. ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
  1217. ((req->tidoffset >> omfactor)
  1218. & 0x7fff)));
  1219. if (idx < 0)
  1220. return idx;
  1221. /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
  1222. val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
  1223. (EXP_TID_GET(tidval, IDX) & 0x3ff));
  1224. if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
  1225. val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
  1226. INTR) <<
  1227. AHG_KDETH_INTR_SHIFT));
  1228. } else {
  1229. val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
  1230. cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
  1231. cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
  1232. INTR) <<
  1233. AHG_KDETH_INTR_SHIFT));
  1234. }
  1235. idx = ahg_header_set(ahg, idx, array_size,
  1236. 7, 16, 14, (__force u16)val);
  1237. if (idx < 0)
  1238. return idx;
  1239. }
  1240. trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
  1241. req->info.comp_idx, req->sde->this_idx,
  1242. req->ahg_idx, ahg, idx, tidval);
  1243. sdma_txinit_ahg(&tx->txreq,
  1244. SDMA_TXREQ_F_USE_AHG,
  1245. datalen, req->ahg_idx, idx,
  1246. ahg, sizeof(req->hdr),
  1247. user_sdma_txreq_cb);
  1248. return idx;
  1249. }
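/*
 * Note on the AHG path (restated from the code above; no new hardware
 * details): rather than copying a full header per packet, only the words
 * that change are described to the engine as (header dword, bit offset,
 * width, value) tuples built by ahg_header_set() - PBC.PbcLengthDWs and
 * LRH.PktLen when the length changes, BTH.PSN/BTH.A, KDETH.Offset, and, for
 * EXPECTED requests, the KDETH TID fields.
 */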
  1250. /**
  1251. * user_sdma_txreq_cb() - SDMA tx request completion callback.
  1252. * @txreq: valid sdma tx request
  1253. * @status: success/failure of request
  1254. *
  1255. * Called when the SDMA progress state machine gets notification that
  1256. * the SDMA descriptors for this tx request have been processed by the
  1257. * DMA engine. Called in interrupt context.
  1258. * Only do work on completed sequences.
  1259. */
  1260. static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
  1261. {
  1262. struct user_sdma_txreq *tx =
  1263. container_of(txreq, struct user_sdma_txreq, txreq);
  1264. struct user_sdma_request *req;
  1265. struct hfi1_user_sdma_pkt_q *pq;
  1266. struct hfi1_user_sdma_comp_q *cq;
  1267. enum hfi1_sdma_comp_state state = COMPLETE;
  1268. if (!tx->req)
  1269. return;
  1270. req = tx->req;
  1271. pq = req->pq;
  1272. cq = req->cq;
  1273. if (status != SDMA_TXREQ_S_OK) {
  1274. SDMA_DBG(req, "SDMA completion with error %d",
  1275. status);
  1276. WRITE_ONCE(req->has_error, 1);
  1277. state = ERROR;
  1278. }
  1279. req->seqcomp = tx->seqnum;
  1280. kmem_cache_free(pq->txreq_cache, tx);
  1281. /* sequence isn't complete? We are done */
  1282. if (req->seqcomp != req->info.npkts - 1)
  1283. return;
  1284. user_sdma_free_request(req, false);
  1285. set_comp_state(pq, cq, req->info.comp_idx, state, status);
  1286. pq_update(pq);
  1287. }
  1288. static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
  1289. {
  1290. if (atomic_dec_and_test(&pq->n_reqs))
  1291. wake_up(&pq->wait);
  1292. }
  1293. static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
  1294. {
  1295. int i;
  1296. if (!list_empty(&req->txps)) {
  1297. struct sdma_txreq *t, *p;
  1298. list_for_each_entry_safe(t, p, &req->txps, list) {
  1299. struct user_sdma_txreq *tx =
  1300. container_of(t, struct user_sdma_txreq, txreq);
  1301. list_del_init(&t->list);
  1302. sdma_txclean(req->pq->dd, t);
  1303. kmem_cache_free(req->pq->txreq_cache, tx);
  1304. }
  1305. }
  1306. for (i = 0; i < req->data_iovs; i++) {
  1307. struct sdma_mmu_node *node = req->iovs[i].node;
  1308. if (!node)
  1309. continue;
  1310. req->iovs[i].node = NULL;
  1311. if (unpin)
  1312. hfi1_mmu_rb_remove(req->pq->handler,
  1313. &node->rb);
  1314. else
  1315. atomic_dec(&node->refcount);
  1316. }
  1317. kfree(req->tids);
  1318. clear_bit(req->info.comp_idx, req->pq->req_in_use);
  1319. }
  1320. static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  1321. struct hfi1_user_sdma_comp_q *cq,
  1322. u16 idx, enum hfi1_sdma_comp_state state,
  1323. int ret)
  1324. {
  1325. if (state == ERROR)
  1326. cq->comps[idx].errcode = -ret;
  1327. smp_wmb(); /* make sure errcode is visible first */
  1328. cq->comps[idx].status = state;
  1329. trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
  1330. idx, state, ret);
  1331. }
  1332. static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
  1333. unsigned long len)
  1334. {
  1335. return (bool)(node->addr == addr);
  1336. }
  1337. static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
  1338. {
  1339. struct sdma_mmu_node *node =
  1340. container_of(mnode, struct sdma_mmu_node, rb);
  1341. atomic_inc(&node->refcount);
  1342. return 0;
  1343. }
  1344. /*
  1345. * Return 1 to remove the node from the rb tree and call the remove op.
  1346. *
  1347. * Called with the rb tree lock held.
  1348. */
  1349. static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
  1350. void *evict_arg, bool *stop)
  1351. {
  1352. struct sdma_mmu_node *node =
  1353. container_of(mnode, struct sdma_mmu_node, rb);
  1354. struct evict_data *evict_data = evict_arg;
  1355. /* is this node still being used? */
  1356. if (atomic_read(&node->refcount))
  1357. return 0; /* keep this node */
  1358. /* this node will be evicted, add its pages to our count */
  1359. evict_data->cleared += node->npages;
  1360. /* have enough pages been cleared? */
  1361. if (evict_data->cleared >= evict_data->target)
  1362. *stop = true;
  1363. return 1; /* remove this node */
  1364. }
  1365. static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
  1366. {
  1367. struct sdma_mmu_node *node =
  1368. container_of(mnode, struct sdma_mmu_node, rb);
  1369. unpin_sdma_pages(node);
  1370. kfree(node);
  1371. }
  1372. static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
  1373. {
  1374. struct sdma_mmu_node *node =
  1375. container_of(mnode, struct sdma_mmu_node, rb);
  1376. if (!atomic_read(&node->refcount))
  1377. return 1;
  1378. return 0;
  1379. }