|
@@ -1,4 +1,5 @@
|
|
|
/*
|
|
|
+ * Copyright (c) 2016 Oracle. All rights reserved.
|
|
|
* Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
|
|
|
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
|
|
|
*
|
|
@@ -40,6 +41,63 @@
|
|
|
* Author: Tom Tucker <tom@opengridcomputing.com>
|
|
|
*/
|
|
|
|
|
|
+/* Operation
|
|
|
+ *
|
|
|
+ * The main entry point is svc_rdma_sendto. This is called by the
|
|
|
+ * RPC server when an RPC Reply is ready to be transmitted to a client.
|
|
|
+ *
|
|
|
+ * The passed-in svc_rqst contains a struct xdr_buf which holds an
|
|
|
+ * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
|
|
|
+ * transport header, post all Write WRs needed for this Reply, then post
|
|
|
+ * a Send WR conveying the transport header and the RPC message itself to
|
|
|
+ * the client.
|
|
|
+ *
|
|
|
+ * svc_rdma_sendto must fully transmit the Reply before returning, as
|
|
|
+ * the svc_rqst will be recycled as soon as sendto returns. Remaining
|
|
|
+ * resources referred to by the svc_rqst are also recycled at that time.
|
|
|
+ * Therefore any resources that must remain longer must be detached
|
|
|
+ * from the svc_rqst and released later.
|
|
|
+ *
|
|
|
+ * Page Management
|
|
|
+ *
|
|
|
+ * The I/O that performs Reply transmission is asynchronous, and may
|
|
|
+ * complete well after sendto returns. Thus pages under I/O must be
|
|
|
+ * removed from the svc_rqst before sendto returns.
|
|
|
+ *
|
|
|
+ * The logic here depends on Send Queue and completion ordering. Since
|
|
|
+ * the Send WR is always posted last, it will always complete last. Thus
|
|
|
+ * when it completes, it is guaranteed that all previous Write WRs have
|
|
|
+ * also completed.
|
|
|
+ *
|
|
|
+ * Write WRs are constructed and posted. Each Write segment gets its own
|
|
|
+ * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
|
|
|
+ * DMA-unmap the pages under I/O for that Write segment. The Write
|
|
|
+ * completion handler does not release any pages.
|
|
|
+ *
|
|
|
+ * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt.
|
|
|
+ * The ownership of all of the Reply's pages is transferred into that
|
|
|
+ * ctxt, the Send WR is posted, and sendto returns.
|
|
|
+ *
|
|
|
+ * The svc_rdma_op_ctxt is presented when the Send WR completes. The
|
|
|
+ * Send completion handler finally releases the Reply's pages.
|
|
|
+ *
|
|
|
+ * This mechanism also assumes that completions on the transport's Send
|
|
|
+ * Completion Queue do not run in parallel. Otherwise a Write completion
|
|
|
+ * and Send completion running at the same time could release pages that
|
|
|
+ * are still DMA-mapped.
|
|
|
+ *
|
|
|
+ * Error Handling
|
|
|
+ *
|
|
|
+ * - If the Send WR is posted successfully, it will either complete
|
|
|
+ * successfully, or get flushed. Either way, the Send completion
|
|
|
+ * handler releases the Reply's pages.
|
|
|
+ * - If the Send WR cannot be posted, the forward path releases
|
|
|
+ * the Reply's pages.
|
|
|
+ *
|
|
|
+ * Without the use of page reference counting, this handles the case
|
|
|
+ * where two different Write segments send portions of the same page.
|
|
|
+ */
|
|
|
+
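A rough user-space model of the page handoff described above (with simplified
stand-ins for svc_rqst, svc_rdma_op_ctxt, and the completion handlers; none of
these names or types are the kernel's own) shows why only the Send completion,
which always runs last, frees the Reply's pages:

#include <stdlib.h>

#define MAX_PAGES 8

struct fake_rqst {                      /* stands in for svc_rqst */
	void *respages[MAX_PAGES];
	int next_page;
};

struct fake_send_ctxt {                 /* stands in for svc_rdma_op_ctxt */
	void *pages[MAX_PAGES];
	int count;
};

/* Transfer page ownership out of the rqst before sendto returns. */
static void save_io_pages(struct fake_rqst *rqstp, struct fake_send_ctxt *ctxt)
{
	for (int i = 0; i < rqstp->next_page; i++) {
		ctxt->pages[ctxt->count++] = rqstp->respages[i];
		rqstp->respages[i] = NULL;
	}
	rqstp->next_page = 0;
}

/* Write completion: DMA-unmaps its segment, but releases no pages. */
static void fake_wc_write(void)
{
}

/* Send completion: ordered after all Writes, so freeing is safe here. */
static void fake_wc_send(struct fake_send_ctxt *ctxt)
{
	for (int i = 0; i < ctxt->count; i++)
		free(ctxt->pages[i]);
	ctxt->count = 0;
}

int main(void)
{
	struct fake_rqst rqst = { .next_page = 2 };
	struct fake_send_ctxt ctxt = { .count = 0 };

	rqst.respages[0] = malloc(4096);
	rqst.respages[1] = malloc(4096);

	save_io_pages(&rqst, &ctxt);    /* rqst may now be recycled */
	fake_wc_write();                /* earlier Write completions */
	fake_wc_send(&ctxt);            /* last completion frees the pages */
	return 0;
}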
|
|
|
#include <linux/sunrpc/debug.h>
|
|
|
#include <linux/sunrpc/rpc_rdma.h>
|
|
|
#include <linux/spinlock.h>
|
|
@@ -55,113 +113,141 @@ static u32 xdr_padsize(u32 len)
|
|
|
return (len & 3) ? (4 - (len & 3)) : 0;
|
|
|
}
|
|
|
|
|
|
-int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
|
|
|
- struct xdr_buf *xdr,
|
|
|
- struct svc_rdma_req_map *vec,
|
|
|
- bool write_chunk_present)
|
|
|
+/* Returns length of transport header, in bytes.
|
|
|
+ */
|
|
|
+static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
|
|
|
{
|
|
|
- int sge_no;
|
|
|
- u32 sge_bytes;
|
|
|
- u32 page_bytes;
|
|
|
- u32 page_off;
|
|
|
- int page_no;
|
|
|
-
|
|
|
- if (xdr->len !=
|
|
|
- (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
|
|
|
- pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
|
|
|
- return -EIO;
|
|
|
- }
|
|
|
+ unsigned int nsegs;
|
|
|
+ __be32 *p;
|
|
|
|
|
|
- /* Skip the first sge, this is for the RPCRDMA header */
|
|
|
- sge_no = 1;
|
|
|
+ p = rdma_resp;
|
|
|
+
|
|
|
+ /* RPC-over-RDMA V1 replies never have a Read list. */
|
|
|
+ p += rpcrdma_fixed_maxsz + 1;
|
|
|
|
|
|
- /* Head SGE */
|
|
|
- vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
|
|
|
- vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
|
|
|
- sge_no++;
|
|
|
-
|
|
|
- /* pages SGE */
|
|
|
- page_no = 0;
|
|
|
- page_bytes = xdr->page_len;
|
|
|
- page_off = xdr->page_base;
|
|
|
- while (page_bytes) {
|
|
|
- vec->sge[sge_no].iov_base =
|
|
|
- page_address(xdr->pages[page_no]) + page_off;
|
|
|
- sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
|
|
|
- page_bytes -= sge_bytes;
|
|
|
- vec->sge[sge_no].iov_len = sge_bytes;
|
|
|
-
|
|
|
- sge_no++;
|
|
|
- page_no++;
|
|
|
- page_off = 0; /* reset for next time through loop */
|
|
|
+ /* Skip Write list. */
|
|
|
+ while (*p++ != xdr_zero) {
|
|
|
+ nsegs = be32_to_cpup(p++);
|
|
|
+ p += nsegs * rpcrdma_segment_maxsz;
|
|
|
}
|
|
|
|
|
|
- /* Tail SGE */
|
|
|
- if (xdr->tail[0].iov_len) {
|
|
|
- unsigned char *base = xdr->tail[0].iov_base;
|
|
|
- size_t len = xdr->tail[0].iov_len;
|
|
|
- u32 xdr_pad = xdr_padsize(xdr->page_len);
|
|
|
+ /* Skip Reply chunk. */
|
|
|
+ if (*p++ != xdr_zero) {
|
|
|
+ nsegs = be32_to_cpup(p++);
|
|
|
+ p += nsegs * rpcrdma_segment_maxsz;
|
|
|
+ }
|
|
|
|
|
|
- if (write_chunk_present && xdr_pad) {
|
|
|
- base += xdr_pad;
|
|
|
- len -= xdr_pad;
|
|
|
- }
|
|
|
+ return (unsigned long)p - (unsigned long)rdma_resp;
|
|
|
+}
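For reference, a stand-alone user-space sketch of the same walk, assuming the
usual RPC-over-RDMA v1 word counts (a 4-word fixed header and 4-word segments)
and using ntohl/htonl in place of the kernel's XDR helpers; the constants and
names below are illustrative, not the kernel's:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define FIXED_MAXSZ   4   /* xid, vers, credits, proc */
#define SEGMENT_MAXSZ 4   /* handle, length, 2-word offset */

static unsigned int reply_hdr_len(const uint32_t *rdma_resp)
{
	const uint32_t *p = rdma_resp;
	uint32_t nsegs;

	p += FIXED_MAXSZ + 1;           /* fixed header + empty Read list */

	while (ntohl(*p++) != 0) {      /* one entry per Write chunk */
		nsegs = ntohl(*p++);
		p += nsegs * SEGMENT_MAXSZ;
	}

	if (ntohl(*p++) != 0) {         /* optional Reply chunk */
		nsegs = ntohl(*p++);
		p += nsegs * SEGMENT_MAXSZ;
	}

	return (unsigned int)((p - rdma_resp) * sizeof(uint32_t));
}

int main(void)
{
	/* fixed header, empty Read list, one single-segment Write chunk,
	 * Write list terminator, no Reply chunk
	 */
	uint32_t hdr[FIXED_MAXSZ + 1 + 2 + SEGMENT_MAXSZ + 1 + 1] = { 0 };
	uint32_t *p = hdr + FIXED_MAXSZ;

	*p++ = htonl(0);                /* Read list: empty */
	*p++ = htonl(1);                /* Write list: a chunk follows */
	*p++ = htonl(1);                /* ... with one segment */
	p += SEGMENT_MAXSZ;             /* segment contents don't matter here */
	*p++ = htonl(0);                /* Write list terminator */
	*p = htonl(0);                  /* Reply chunk: absent */

	printf("header length: %u bytes\n", reply_hdr_len(hdr));
	return 0;
}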
|
|
|
|
|
|
- if (len) {
|
|
|
- vec->sge[sge_no].iov_base = base;
|
|
|
- vec->sge[sge_no].iov_len = len;
|
|
|
- sge_no++;
|
|
|
+/* One Write chunk is copied from Call transport header to Reply
|
|
|
+ * transport header. Each segment's length field is updated to
|
|
|
+ * reflect the number of bytes consumed in the segment.
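+ *
+ * For example, if the chunk carries two 8192-byte segments and 10000
+ * bytes of payload were consumed, the copied chunk's segment lengths
+ * become 8192 and 1808.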
|
|
|
+ *
|
|
|
+ * Returns number of segments in this chunk.
|
|
|
+ */
|
|
|
+static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
|
|
|
+ unsigned int remaining)
|
|
|
+{
|
|
|
+ unsigned int i, nsegs;
|
|
|
+ u32 seg_len;
|
|
|
+
|
|
|
+ /* Write list discriminator */
|
|
|
+ *dst++ = *src++;
|
|
|
+
|
|
|
+ /* number of segments in this chunk */
|
|
|
+ nsegs = be32_to_cpup(src);
|
|
|
+ *dst++ = *src++;
|
|
|
+
|
|
|
+ for (i = nsegs; i; i--) {
|
|
|
+ /* segment's RDMA handle */
|
|
|
+ *dst++ = *src++;
|
|
|
+
|
|
|
+ /* bytes returned in this segment */
|
|
|
+ seg_len = be32_to_cpu(*src);
|
|
|
+ if (remaining >= seg_len) {
|
|
|
+ /* entire segment was consumed */
|
|
|
+ *dst = *src;
|
|
|
+ remaining -= seg_len;
|
|
|
+ } else {
|
|
|
+ /* segment only partly filled */
|
|
|
+ *dst = cpu_to_be32(remaining);
|
|
|
+ remaining = 0;
|
|
|
}
|
|
|
- }
|
|
|
+ dst++; src++;
|
|
|
|
|
|
- dprintk("svcrdma: %s: sge_no %d page_no %d "
|
|
|
- "page_base %u page_len %u head_len %zu tail_len %zu\n",
|
|
|
- __func__, sge_no, page_no, xdr->page_base, xdr->page_len,
|
|
|
- xdr->head[0].iov_len, xdr->tail[0].iov_len);
|
|
|
+ /* segment's RDMA offset */
|
|
|
+ *dst++ = *src++;
|
|
|
+ *dst++ = *src++;
|
|
|
+ }
|
|
|
|
|
|
- vec->count = sge_no;
|
|
|
- return 0;
|
|
|
+ return nsegs;
|
|
|
}
|
|
|
|
|
|
-static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
|
|
|
- struct xdr_buf *xdr,
|
|
|
- u32 xdr_off, size_t len, int dir)
|
|
|
+/* The client provided a Write list in the Call message. Fill in
|
|
|
+ * the segments in the first Write chunk in the Reply's transport
|
|
|
+ * header with the number of bytes consumed in each segment.
|
|
|
+ * Remaining chunks are returned unused (segment lengths set to zero).
|
|
|
+ *
|
|
|
+ * Assumptions:
|
|
|
+ * - Client has provided only one Write chunk
|
|
|
+ */
|
|
|
+static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
|
|
|
+ unsigned int consumed)
|
|
|
{
|
|
|
- struct page *page;
|
|
|
- dma_addr_t dma_addr;
|
|
|
- if (xdr_off < xdr->head[0].iov_len) {
|
|
|
- /* This offset is in the head */
|
|
|
- xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
|
|
|
- page = virt_to_page(xdr->head[0].iov_base);
|
|
|
- } else {
|
|
|
- xdr_off -= xdr->head[0].iov_len;
|
|
|
- if (xdr_off < xdr->page_len) {
|
|
|
- /* This offset is in the page list */
|
|
|
- xdr_off += xdr->page_base;
|
|
|
- page = xdr->pages[xdr_off >> PAGE_SHIFT];
|
|
|
- xdr_off &= ~PAGE_MASK;
|
|
|
- } else {
|
|
|
- /* This offset is in the tail */
|
|
|
- xdr_off -= xdr->page_len;
|
|
|
- xdr_off += (unsigned long)
|
|
|
- xdr->tail[0].iov_base & ~PAGE_MASK;
|
|
|
- page = virt_to_page(xdr->tail[0].iov_base);
|
|
|
- }
|
|
|
+ unsigned int nsegs;
|
|
|
+ __be32 *p, *q;
|
|
|
+
|
|
|
+ /* RPC-over-RDMA V1 replies never have a Read list. */
|
|
|
+ p = rdma_resp + rpcrdma_fixed_maxsz + 1;
|
|
|
+
|
|
|
+ q = wr_ch;
|
|
|
+ while (*q != xdr_zero) {
|
|
|
+ nsegs = xdr_encode_write_chunk(p, q, consumed);
|
|
|
+ q += 2 + nsegs * rpcrdma_segment_maxsz;
|
|
|
+ p += 2 + nsegs * rpcrdma_segment_maxsz;
|
|
|
+ consumed = 0;
|
|
|
}
|
|
|
- dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
|
|
|
- min_t(size_t, PAGE_SIZE, len), dir);
|
|
|
- return dma_addr;
|
|
|
+
|
|
|
+ /* Terminate Write list */
|
|
|
+ *p++ = xdr_zero;
|
|
|
+
|
|
|
+ /* Reply chunk discriminator; may be replaced later */
|
|
|
+ *p = xdr_zero;
|
|
|
+}
|
|
|
+
|
|
|
+/* The client provided a Reply chunk in the Call message. Fill in
|
|
|
+ * the segments in the Reply chunk in the Reply message with the
|
|
|
+ * number of bytes consumed in each segment.
|
|
|
+ *
|
|
|
+ * Assumptions:
|
|
|
+ * - Reply can always fit in the provided Reply chunk
|
|
|
+ */
|
|
|
+static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
|
|
|
+ unsigned int consumed)
|
|
|
+{
|
|
|
+ __be32 *p;
|
|
|
+
|
|
|
+ /* Find the Reply chunk in the Reply's xprt header.
|
|
|
+ * RPC-over-RDMA V1 replies never have a Read list.
|
|
|
+ */
|
|
|
+ p = rdma_resp + rpcrdma_fixed_maxsz + 1;
|
|
|
+
|
|
|
+ /* Skip past Write list */
|
|
|
+ while (*p++ != xdr_zero)
|
|
|
+ p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
|
|
|
+
|
|
|
+ xdr_encode_write_chunk(p, rp_ch, consumed);
|
|
|
}
|
|
|
|
|
|
/* Parse the RPC Call's transport header.
|
|
|
*/
|
|
|
-static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
|
|
|
- struct rpcrdma_write_array **write,
|
|
|
- struct rpcrdma_write_array **reply)
|
|
|
+static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
|
|
|
+ __be32 **write, __be32 **reply)
|
|
|
{
|
|
|
__be32 *p;
|
|
|
|
|
|
- p = (__be32 *)&rmsgp->rm_body.rm_chunks[0];
|
|
|
+ p = rdma_argp + rpcrdma_fixed_maxsz;
|
|
|
|
|
|
/* Read list */
|
|
|
while (*p++ != xdr_zero)
|
|
@@ -169,7 +255,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
|
|
|
|
|
|
/* Write list */
|
|
|
if (*p != xdr_zero) {
|
|
|
- *write = (struct rpcrdma_write_array *)p;
|
|
|
+ *write = p;
|
|
|
while (*p++ != xdr_zero)
|
|
|
p += 1 + be32_to_cpu(*p) * 4;
|
|
|
} else {
|
|
@@ -179,7 +265,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
|
|
|
|
|
|
/* Reply chunk */
|
|
|
if (*p != xdr_zero)
|
|
|
- *reply = (struct rpcrdma_write_array *)p;
|
|
|
+ *reply = p;
|
|
|
else
|
|
|
*reply = NULL;
|
|
|
}
|
|
@@ -189,360 +275,321 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
|
|
|
* Invalidate, and responder chooses one rkey to invalidate.
|
|
|
*
|
|
|
* Find a candidate rkey to invalidate when sending a reply. Picks the
|
|
|
- * first rkey it finds in the chunks lists.
|
|
|
+ * first R_key it finds in the chunk lists.
|
|
|
*
|
|
|
* Returns zero if RPC's chunk lists are empty.
|
|
|
*/
|
|
|
-static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
|
|
|
- struct rpcrdma_write_array *wr_ary,
|
|
|
- struct rpcrdma_write_array *rp_ary)
|
|
|
+static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
|
|
|
+ __be32 *wr_lst, __be32 *rp_ch)
|
|
|
{
|
|
|
- struct rpcrdma_read_chunk *rd_ary;
|
|
|
- struct rpcrdma_segment *arg_ch;
|
|
|
+ __be32 *p;
|
|
|
|
|
|
- rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
|
|
|
- if (rd_ary->rc_discrim != xdr_zero)
|
|
|
- return be32_to_cpu(rd_ary->rc_target.rs_handle);
|
|
|
+ p = rdma_argp + rpcrdma_fixed_maxsz;
|
|
|
+ if (*p != xdr_zero)
|
|
|
+ p += 2;
|
|
|
+ else if (wr_lst && be32_to_cpup(wr_lst + 1))
|
|
|
+ p = wr_lst + 2;
|
|
|
+ else if (rp_ch && be32_to_cpup(rp_ch + 1))
|
|
|
+ p = rp_ch + 2;
|
|
|
+ else
|
|
|
+ return 0;
|
|
|
+ return be32_to_cpup(p);
|
|
|
+}
|
|
|
|
|
|
- if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
|
|
|
- arg_ch = &wr_ary->wc_array[0].wc_target;
|
|
|
- return be32_to_cpu(arg_ch->rs_handle);
|
|
|
- }
|
|
|
+/* ib_dma_map_page() is used here because svc_rdma_dma_unmap(),
+ * which DMA-unmaps this memory at completion time, calls
+ * ib_dma_unmap_page() exclusively.
|
|
|
+ */
|
|
|
+static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
|
|
|
+ struct svc_rdma_op_ctxt *ctxt,
|
|
|
+ unsigned int sge_no,
|
|
|
+ unsigned char *base,
|
|
|
+ unsigned int len)
|
|
|
+{
|
|
|
+ unsigned long offset = (unsigned long)base & ~PAGE_MASK;
|
|
|
+ struct ib_device *dev = rdma->sc_cm_id->device;
|
|
|
+ dma_addr_t dma_addr;
|
|
|
|
|
|
- if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
|
|
|
- arg_ch = &rp_ary->wc_array[0].wc_target;
|
|
|
- return be32_to_cpu(arg_ch->rs_handle);
|
|
|
- }
|
|
|
+ dma_addr = ib_dma_map_page(dev, virt_to_page(base),
|
|
|
+ offset, len, DMA_TO_DEVICE);
|
|
|
+ if (ib_dma_mapping_error(dev, dma_addr))
|
|
|
+ return -EIO;
|
|
|
|
|
|
+ ctxt->sge[sge_no].addr = dma_addr;
|
|
|
+ ctxt->sge[sge_no].length = len;
|
|
|
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
|
|
|
+ svc_rdma_count_mappings(rdma, ctxt);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-/* Assumptions:
|
|
|
- * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
|
|
|
- */
|
|
|
-static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
|
|
|
- u32 rmr, u64 to,
|
|
|
- u32 xdr_off, int write_len,
|
|
|
- struct svc_rdma_req_map *vec)
|
|
|
+static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
|
|
|
+ struct svc_rdma_op_ctxt *ctxt,
|
|
|
+ unsigned int sge_no,
|
|
|
+ struct page *page,
|
|
|
+ unsigned int offset,
|
|
|
+ unsigned int len)
|
|
|
{
|
|
|
- struct ib_rdma_wr write_wr;
|
|
|
- struct ib_sge *sge;
|
|
|
- int xdr_sge_no;
|
|
|
- int sge_no;
|
|
|
- int sge_bytes;
|
|
|
- int sge_off;
|
|
|
- int bc;
|
|
|
- struct svc_rdma_op_ctxt *ctxt;
|
|
|
+ struct ib_device *dev = rdma->sc_cm_id->device;
|
|
|
+ dma_addr_t dma_addr;
|
|
|
|
|
|
- if (vec->count > RPCSVC_MAXPAGES) {
|
|
|
- pr_err("svcrdma: Too many pages (%lu)\n", vec->count);
|
|
|
+ dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
|
|
|
+ if (ib_dma_mapping_error(dev, dma_addr))
|
|
|
return -EIO;
|
|
|
- }
|
|
|
|
|
|
- dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
|
|
|
- "write_len=%d, vec->sge=%p, vec->count=%lu\n",
|
|
|
- rmr, (unsigned long long)to, xdr_off,
|
|
|
- write_len, vec->sge, vec->count);
|
|
|
+ ctxt->sge[sge_no].addr = dma_addr;
|
|
|
+ ctxt->sge[sge_no].length = len;
|
|
|
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
|
|
|
+ svc_rdma_count_mappings(rdma, ctxt);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
|
|
|
- ctxt = svc_rdma_get_context(xprt);
|
|
|
+/**
|
|
|
+ * svc_rdma_map_reply_hdr - DMA map the transport header buffer
|
|
|
+ * @rdma: controlling transport
|
|
|
+ * @ctxt: op_ctxt for the Send WR
|
|
|
+ * @rdma_resp: buffer containing transport header
|
|
|
+ * @len: length of transport header
|
|
|
+ *
|
|
|
+ * Returns:
|
|
|
+ * %0 if the header is DMA mapped,
|
|
|
+ * %-EIO if DMA mapping failed.
|
|
|
+ */
|
|
|
+int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
|
|
|
+ struct svc_rdma_op_ctxt *ctxt,
|
|
|
+ __be32 *rdma_resp,
|
|
|
+ unsigned int len)
|
|
|
+{
|
|
|
ctxt->direction = DMA_TO_DEVICE;
|
|
|
- sge = ctxt->sge;
|
|
|
-
|
|
|
- /* Find the SGE associated with xdr_off */
|
|
|
- for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
|
|
|
- xdr_sge_no++) {
|
|
|
- if (vec->sge[xdr_sge_no].iov_len > bc)
|
|
|
- break;
|
|
|
- bc -= vec->sge[xdr_sge_no].iov_len;
|
|
|
- }
|
|
|
-
|
|
|
- sge_off = bc;
|
|
|
- bc = write_len;
|
|
|
- sge_no = 0;
|
|
|
-
|
|
|
- /* Copy the remaining SGE */
|
|
|
- while (bc != 0) {
|
|
|
- sge_bytes = min_t(size_t,
|
|
|
- bc, vec->sge[xdr_sge_no].iov_len-sge_off);
|
|
|
- sge[sge_no].length = sge_bytes;
|
|
|
- sge[sge_no].addr =
|
|
|
- dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
|
|
|
- sge_bytes, DMA_TO_DEVICE);
|
|
|
- xdr_off += sge_bytes;
|
|
|
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
|
|
|
- sge[sge_no].addr))
|
|
|
- goto err;
|
|
|
- svc_rdma_count_mappings(xprt, ctxt);
|
|
|
- sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
|
|
|
- ctxt->count++;
|
|
|
- sge_off = 0;
|
|
|
- sge_no++;
|
|
|
- xdr_sge_no++;
|
|
|
- if (xdr_sge_no > vec->count) {
|
|
|
- pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no);
|
|
|
- goto err;
|
|
|
- }
|
|
|
- bc -= sge_bytes;
|
|
|
- if (sge_no == xprt->sc_max_sge)
|
|
|
- break;
|
|
|
- }
|
|
|
-
|
|
|
- /* Prepare WRITE WR */
|
|
|
- memset(&write_wr, 0, sizeof write_wr);
|
|
|
- ctxt->cqe.done = svc_rdma_wc_write;
|
|
|
- write_wr.wr.wr_cqe = &ctxt->cqe;
|
|
|
- write_wr.wr.sg_list = &sge[0];
|
|
|
- write_wr.wr.num_sge = sge_no;
|
|
|
- write_wr.wr.opcode = IB_WR_RDMA_WRITE;
|
|
|
- write_wr.wr.send_flags = IB_SEND_SIGNALED;
|
|
|
- write_wr.rkey = rmr;
|
|
|
- write_wr.remote_addr = to;
|
|
|
-
|
|
|
- /* Post It */
|
|
|
- atomic_inc(&rdma_stat_write);
|
|
|
- if (svc_rdma_send(xprt, &write_wr.wr))
|
|
|
- goto err;
|
|
|
- return write_len - bc;
|
|
|
- err:
|
|
|
- svc_rdma_unmap_dma(ctxt);
|
|
|
- svc_rdma_put_context(ctxt, 0);
|
|
|
- return -EIO;
|
|
|
+ ctxt->pages[0] = virt_to_page(rdma_resp);
|
|
|
+ ctxt->count = 1;
|
|
|
+ return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len);
|
|
|
}
|
|
|
|
|
|
-noinline
|
|
|
-static int send_write_chunks(struct svcxprt_rdma *xprt,
|
|
|
- struct rpcrdma_write_array *wr_ary,
|
|
|
- struct rpcrdma_msg *rdma_resp,
|
|
|
- struct svc_rqst *rqstp,
|
|
|
- struct svc_rdma_req_map *vec)
|
|
|
+/* Load the xdr_buf into the ctxt's sge array, and DMA map each
|
|
|
+ * element as it is added.
|
|
|
+ *
|
|
|
+ * Returns the number of sge elements loaded on success, or
|
|
|
+ * a negative errno on failure.
|
|
|
+ */
|
|
|
+static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
|
|
|
+ struct svc_rdma_op_ctxt *ctxt,
|
|
|
+ struct xdr_buf *xdr, __be32 *wr_lst)
|
|
|
{
|
|
|
- u32 xfer_len = rqstp->rq_res.page_len;
|
|
|
- int write_len;
|
|
|
- u32 xdr_off;
|
|
|
- int chunk_off;
|
|
|
- int chunk_no;
|
|
|
- int nchunks;
|
|
|
- struct rpcrdma_write_array *res_ary;
|
|
|
+ unsigned int len, sge_no, remaining, page_off;
|
|
|
+ struct page **ppages;
|
|
|
+ unsigned char *base;
|
|
|
+ u32 xdr_pad;
|
|
|
int ret;
|
|
|
|
|
|
- res_ary = (struct rpcrdma_write_array *)
|
|
|
- &rdma_resp->rm_body.rm_chunks[1];
|
|
|
-
|
|
|
- /* Write chunks start at the pagelist */
|
|
|
- nchunks = be32_to_cpu(wr_ary->wc_nchunks);
|
|
|
- for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
|
|
|
- xfer_len && chunk_no < nchunks;
|
|
|
- chunk_no++) {
|
|
|
- struct rpcrdma_segment *arg_ch;
|
|
|
- u64 rs_offset;
|
|
|
-
|
|
|
- arg_ch = &wr_ary->wc_array[chunk_no].wc_target;
|
|
|
- write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length));
|
|
|
-
|
|
|
- /* Prepare the response chunk given the length actually
|
|
|
- * written */
|
|
|
- xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset);
|
|
|
- svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
|
|
|
- arg_ch->rs_handle,
|
|
|
- arg_ch->rs_offset,
|
|
|
- write_len);
|
|
|
- chunk_off = 0;
|
|
|
- while (write_len) {
|
|
|
- ret = send_write(xprt, rqstp,
|
|
|
- be32_to_cpu(arg_ch->rs_handle),
|
|
|
- rs_offset + chunk_off,
|
|
|
- xdr_off,
|
|
|
- write_len,
|
|
|
- vec);
|
|
|
- if (ret <= 0)
|
|
|
- goto out_err;
|
|
|
- chunk_off += ret;
|
|
|
- xdr_off += ret;
|
|
|
- xfer_len -= ret;
|
|
|
- write_len -= ret;
|
|
|
+ sge_no = 1;
|
|
|
+
|
|
|
+ ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++,
|
|
|
+ xdr->head[0].iov_base,
|
|
|
+ xdr->head[0].iov_len);
|
|
|
+ if (ret < 0)
|
|
|
+ return ret;
|
|
|
+
|
|
|
+ /* If a Write chunk is present, the xdr_buf's page list
|
|
|
+ * is not included inline. However, the Upper Layer may
|
|
|
+ * have added XDR padding in the tail buffer, and that
|
|
|
+ * should not be included inline.
|
|
|
+ */
|
|
|
+ if (wr_lst) {
|
|
|
+ base = xdr->tail[0].iov_base;
|
|
|
+ len = xdr->tail[0].iov_len;
|
|
|
+ xdr_pad = xdr_padsize(xdr->page_len);
|
|
|
+
|
|
|
+ if (len && xdr_pad) {
|
|
|
+ base += xdr_pad;
|
|
|
+ len -= xdr_pad;
|
|
|
}
|
|
|
+
|
|
|
+ goto tail;
|
|
|
+ }
|
|
|
+
|
|
|
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
|
|
|
+ page_off = xdr->page_base & ~PAGE_MASK;
|
|
|
+ remaining = xdr->page_len;
|
|
|
+ while (remaining) {
|
|
|
+ len = min_t(u32, PAGE_SIZE - page_off, remaining);
|
|
|
+
|
|
|
+ ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++,
|
|
|
+ *ppages++, page_off, len);
|
|
|
+ if (ret < 0)
|
|
|
+ return ret;
|
|
|
+
|
|
|
+ remaining -= len;
|
|
|
+ page_off = 0;
|
|
|
}
|
|
|
- /* Update the req with the number of chunks actually used */
|
|
|
- svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
|
|
|
|
|
|
- return rqstp->rq_res.page_len;
|
|
|
+ base = xdr->tail[0].iov_base;
|
|
|
+ len = xdr->tail[0].iov_len;
|
|
|
+tail:
|
|
|
+ if (len) {
|
|
|
+ ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len);
|
|
|
+ if (ret < 0)
|
|
|
+ return ret;
|
|
|
+ }
|
|
|
|
|
|
-out_err:
|
|
|
- pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret);
|
|
|
- return -EIO;
|
|
|
+ return sge_no - 1;
|
|
|
}
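To make the head/page-list/tail walk concrete, the sketch below models the same
sge-loading logic in user space with no DMA mapping, using simplified stand-ins
for the xdr_buf and sge types (none of them the kernel's). The Write-chunk case
skips the page list and trims the trailing XDR pad, as described above:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define FAKE_PAGE_SIZE 4096u

struct fake_kvec { void *iov_base; size_t iov_len; };

struct fake_xdr_buf {
	struct fake_kvec head;
	unsigned char **pages;
	unsigned int page_base;
	unsigned int page_len;
	struct fake_kvec tail;
};

struct fake_sge { void *addr; unsigned int length; };

/* Returns the number of sges loaded after sge[0] (the transport header). */
static int load_reply_msg(struct fake_sge *sge, struct fake_xdr_buf *xdr,
			  bool write_chunk_present)
{
	unsigned int sge_no = 1, page_off, remaining, len;
	unsigned char **ppages, *base;

	sge[sge_no].addr = xdr->head.iov_base;
	sge[sge_no].length = xdr->head.iov_len;
	sge_no++;

	base = xdr->tail.iov_base;
	len = xdr->tail.iov_len;
	if (write_chunk_present) {
		/* page list travels via RDMA Write; drop the XDR pad too */
		unsigned int pad = (4 - (xdr->page_len & 3)) & 3;

		if (len && pad) {
			base += pad;
			len -= pad;
		}
	} else {
		ppages = xdr->pages + xdr->page_base / FAKE_PAGE_SIZE;
		page_off = xdr->page_base % FAKE_PAGE_SIZE;
		remaining = xdr->page_len;
		while (remaining) {
			unsigned int n = FAKE_PAGE_SIZE - page_off;

			if (n > remaining)
				n = remaining;
			sge[sge_no].addr = *ppages++ + page_off;
			sge[sge_no].length = n;
			sge_no++;
			remaining -= n;
			page_off = 0;
		}
	}

	if (len) {
		sge[sge_no].addr = base;
		sge[sge_no].length = len;
		sge_no++;
	}
	return sge_no - 1;
}

int main(void)
{
	unsigned char head[100], tail[4], page0[FAKE_PAGE_SIZE];
	unsigned char *pages[] = { page0 };
	struct fake_xdr_buf xdr = {
		.head = { head, sizeof(head) },
		.pages = pages, .page_base = 0, .page_len = 3000,
		.tail = { tail, sizeof(tail) },
	};
	struct fake_sge sge[8];

	printf("inline sges: %d\n", load_reply_msg(sge, &xdr, false));
	printf("with Write chunk: %d\n", load_reply_msg(sge, &xdr, true));
	return 0;
}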
|
|
|
|
|
|
-noinline
|
|
|
-static int send_reply_chunks(struct svcxprt_rdma *xprt,
|
|
|
- struct rpcrdma_write_array *rp_ary,
|
|
|
- struct rpcrdma_msg *rdma_resp,
|
|
|
- struct svc_rqst *rqstp,
|
|
|
- struct svc_rdma_req_map *vec)
|
|
|
+/* The svc_rqst and all resources it owns are released as soon as
|
|
|
+ * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
|
|
|
+ * so they are released by the Send completion handler.
|
|
|
+ */
|
|
|
+static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
|
|
|
+ struct svc_rdma_op_ctxt *ctxt)
|
|
|
{
|
|
|
- u32 xfer_len = rqstp->rq_res.len;
|
|
|
- int write_len;
|
|
|
- u32 xdr_off;
|
|
|
- int chunk_no;
|
|
|
- int chunk_off;
|
|
|
- int nchunks;
|
|
|
- struct rpcrdma_segment *ch;
|
|
|
- struct rpcrdma_write_array *res_ary;
|
|
|
- int ret;
|
|
|
+ int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
|
|
|
|
|
|
- /* XXX: need to fix when reply lists occur with read-list and or
|
|
|
- * write-list */
|
|
|
- res_ary = (struct rpcrdma_write_array *)
|
|
|
- &rdma_resp->rm_body.rm_chunks[2];
|
|
|
-
|
|
|
- /* xdr offset starts at RPC message */
|
|
|
- nchunks = be32_to_cpu(rp_ary->wc_nchunks);
|
|
|
- for (xdr_off = 0, chunk_no = 0;
|
|
|
- xfer_len && chunk_no < nchunks;
|
|
|
- chunk_no++) {
|
|
|
- u64 rs_offset;
|
|
|
- ch = &rp_ary->wc_array[chunk_no].wc_target;
|
|
|
- write_len = min(xfer_len, be32_to_cpu(ch->rs_length));
|
|
|
-
|
|
|
- /* Prepare the reply chunk given the length actually
|
|
|
- * written */
|
|
|
- xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset);
|
|
|
- svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
|
|
|
- ch->rs_handle, ch->rs_offset,
|
|
|
- write_len);
|
|
|
- chunk_off = 0;
|
|
|
- while (write_len) {
|
|
|
- ret = send_write(xprt, rqstp,
|
|
|
- be32_to_cpu(ch->rs_handle),
|
|
|
- rs_offset + chunk_off,
|
|
|
- xdr_off,
|
|
|
- write_len,
|
|
|
- vec);
|
|
|
- if (ret <= 0)
|
|
|
- goto out_err;
|
|
|
- chunk_off += ret;
|
|
|
- xdr_off += ret;
|
|
|
- xfer_len -= ret;
|
|
|
- write_len -= ret;
|
|
|
- }
|
|
|
+ ctxt->count += pages;
|
|
|
+ for (i = 0; i < pages; i++) {
|
|
|
+ ctxt->pages[i + 1] = rqstp->rq_respages[i];
|
|
|
+ rqstp->rq_respages[i] = NULL;
|
|
|
}
|
|
|
- /* Update the req with the number of chunks actually used */
|
|
|
- svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
|
|
|
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
|
|
|
+}
|
|
|
|
|
|
- return rqstp->rq_res.len;
|
|
|
+/**
|
|
|
+ * svc_rdma_post_send_wr - Set up and post one Send Work Request
|
|
|
+ * @rdma: controlling transport
|
|
|
+ * @ctxt: op_ctxt for transmitting the Send WR
|
|
|
+ * @num_sge: number of SGEs to send
|
|
|
+ * @inv_rkey: R_key argument to Send With Invalidate, or zero
|
|
|
+ *
|
|
|
+ * Returns:
|
|
|
+ * %0 if the Send* was posted successfully,
|
|
|
+ * %-ENOTCONN if the connection was lost or dropped,
|
|
|
+ * %-EINVAL if there was a problem with the Send we built,
|
|
|
+ * %-ENOMEM if ib_post_send failed.
|
|
|
+ */
|
|
|
+int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
|
|
|
+ struct svc_rdma_op_ctxt *ctxt, int num_sge,
|
|
|
+ u32 inv_rkey)
|
|
|
+{
|
|
|
+ struct ib_send_wr *send_wr = &ctxt->send_wr;
|
|
|
|
|
|
-out_err:
|
|
|
- pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret);
|
|
|
- return -EIO;
|
|
|
+ dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge);
|
|
|
+
|
|
|
+ send_wr->next = NULL;
|
|
|
+ ctxt->cqe.done = svc_rdma_wc_send;
|
|
|
+ send_wr->wr_cqe = &ctxt->cqe;
|
|
|
+ send_wr->sg_list = ctxt->sge;
|
|
|
+ send_wr->num_sge = num_sge;
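+ /* Always signaled: the Send completion must run so that the
+ * pages saved in the ctxt can be released.
+ */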
|
|
|
+ send_wr->send_flags = IB_SEND_SIGNALED;
|
|
|
+ if (inv_rkey) {
|
|
|
+ send_wr->opcode = IB_WR_SEND_WITH_INV;
|
|
|
+ send_wr->ex.invalidate_rkey = inv_rkey;
|
|
|
+ } else {
|
|
|
+ send_wr->opcode = IB_WR_SEND;
|
|
|
+ }
|
|
|
+
|
|
|
+ return svc_rdma_send(rdma, send_wr);
|
|
|
}
|
|
|
|
|
|
-/* This function prepares the portion of the RPCRDMA message to be
|
|
|
- * sent in the RDMA_SEND. This function is called after data sent via
|
|
|
- * RDMA has already been transmitted. There are three cases:
|
|
|
- * - The RPCRDMA header, RPC header, and payload are all sent in a
|
|
|
- * single RDMA_SEND. This is the "inline" case.
|
|
|
- * - The RPCRDMA header and some portion of the RPC header and data
|
|
|
- * are sent via this RDMA_SEND and another portion of the data is
|
|
|
- * sent via RDMA.
|
|
|
- * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
|
|
|
- * header and data are all transmitted via RDMA.
|
|
|
- * In all three cases, this function prepares the RPCRDMA header in
|
|
|
- * sge[0], the 'type' parameter indicates the type to place in the
|
|
|
- * RPCRDMA header, and the 'byte_count' field indicates how much of
|
|
|
- * the XDR to include in this RDMA_SEND. NB: The offset of the payload
|
|
|
- * to send is zero in the XDR.
|
|
|
+/* Prepare the portion of the RPC Reply that will be transmitted
|
|
|
+ * via RDMA Send. The RPC-over-RDMA transport header is prepared
|
|
|
+ * in sge[0], and the RPC xdr_buf is prepared in following sges.
|
|
|
+ *
|
|
|
+ * Depending on whether a Write list or Reply chunk is present,
|
|
|
+ * the server may send all, a portion of, or none of the xdr_buf.
|
|
|
+ * In the latter case, only the transport header (sge[0]) is
|
|
|
+ * transmitted.
|
|
|
+ *
|
|
|
+ * RDMA Send is the last step of transmitting an RPC reply. Pages
|
|
|
+ * involved in the earlier RDMA Writes are here transferred out
|
|
|
+ * of the rqstp and into the ctxt's page array. These pages are
|
|
|
+ * DMA unmapped by each Write completion, but the subsequent Send
|
|
|
+ * completion finally releases these pages.
|
|
|
+ *
|
|
|
+ * Assumptions:
|
|
|
+ * - The Reply's transport header will never be larger than a page.
|
|
|
*/
|
|
|
-static int send_reply(struct svcxprt_rdma *rdma,
|
|
|
- struct svc_rqst *rqstp,
|
|
|
- struct page *page,
|
|
|
- struct rpcrdma_msg *rdma_resp,
|
|
|
- struct svc_rdma_req_map *vec,
|
|
|
- int byte_count,
|
|
|
- u32 inv_rkey)
|
|
|
+static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
|
|
|
+ __be32 *rdma_argp, __be32 *rdma_resp,
|
|
|
+ struct svc_rqst *rqstp,
|
|
|
+ __be32 *wr_lst, __be32 *rp_ch)
|
|
|
{
|
|
|
struct svc_rdma_op_ctxt *ctxt;
|
|
|
- struct ib_send_wr send_wr;
|
|
|
- u32 xdr_off;
|
|
|
- int sge_no;
|
|
|
- int sge_bytes;
|
|
|
- int page_no;
|
|
|
- int pages;
|
|
|
- int ret = -EIO;
|
|
|
-
|
|
|
- /* Prepare the context */
|
|
|
+ u32 inv_rkey;
|
|
|
+ int ret;
|
|
|
+
|
|
|
+ dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n",
|
|
|
+ (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"),
|
|
|
+ rqstp->rq_res.head[0].iov_len,
|
|
|
+ rqstp->rq_res.page_len,
|
|
|
+ rqstp->rq_res.tail[0].iov_len);
|
|
|
+
|
|
|
ctxt = svc_rdma_get_context(rdma);
|
|
|
- ctxt->direction = DMA_TO_DEVICE;
|
|
|
- ctxt->pages[0] = page;
|
|
|
- ctxt->count = 1;
|
|
|
|
|
|
- /* Prepare the SGE for the RPCRDMA Header */
|
|
|
- ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
|
|
|
- ctxt->sge[0].length =
|
|
|
- svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
|
|
|
- ctxt->sge[0].addr =
|
|
|
- ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
|
|
|
- ctxt->sge[0].length, DMA_TO_DEVICE);
|
|
|
- if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
|
|
|
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp,
|
|
|
+ svc_rdma_reply_hdr_len(rdma_resp));
|
|
|
+ if (ret < 0)
|
|
|
goto err;
|
|
|
- svc_rdma_count_mappings(rdma, ctxt);
|
|
|
-
|
|
|
- ctxt->direction = DMA_TO_DEVICE;
|
|
|
|
|
|
- /* Map the payload indicated by 'byte_count' */
|
|
|
- xdr_off = 0;
|
|
|
- for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
|
|
|
- sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
|
|
|
- byte_count -= sge_bytes;
|
|
|
- ctxt->sge[sge_no].addr =
|
|
|
- dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
|
|
|
- sge_bytes, DMA_TO_DEVICE);
|
|
|
- xdr_off += sge_bytes;
|
|
|
- if (ib_dma_mapping_error(rdma->sc_cm_id->device,
|
|
|
- ctxt->sge[sge_no].addr))
|
|
|
+ if (!rp_ch) {
|
|
|
+ ret = svc_rdma_map_reply_msg(rdma, ctxt,
|
|
|
+ &rqstp->rq_res, wr_lst);
|
|
|
+ if (ret < 0)
|
|
|
goto err;
|
|
|
- svc_rdma_count_mappings(rdma, ctxt);
|
|
|
- ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
|
|
|
- ctxt->sge[sge_no].length = sge_bytes;
|
|
|
}
|
|
|
- if (byte_count != 0) {
|
|
|
- pr_err("svcrdma: Could not map %d bytes\n", byte_count);
|
|
|
+
|
|
|
+ svc_rdma_save_io_pages(rqstp, ctxt);
|
|
|
+
|
|
|
+ inv_rkey = 0;
|
|
|
+ if (rdma->sc_snd_w_inv)
|
|
|
+ inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
|
|
|
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey);
|
|
|
+ if (ret)
|
|
|
goto err;
|
|
|
- }
|
|
|
|
|
|
- /* Save all respages in the ctxt and remove them from the
|
|
|
- * respages array. They are our pages until the I/O
|
|
|
- * completes.
|
|
|
+ return 0;
|
|
|
+
|
|
|
+err:
|
|
|
+ pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
|
|
|
+ svc_rdma_unmap_dma(ctxt);
|
|
|
+ svc_rdma_put_context(ctxt, 1);
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
+/* Given the client-provided Write and Reply chunks, the server was not
|
|
|
+ * able to form a complete reply. Return an RDMA_ERROR message so the
|
|
|
+ * client can retire this RPC transaction. As above, the Send completion
|
|
|
+ * routine releases payload pages that were part of a previous RDMA Write.
|
|
|
+ *
|
|
|
+ * Remote Invalidation is skipped for simplicity.
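+ *
+ * The rewritten header is five XDR words (xid, vers, credits, rdma_error,
+ * err_chunk), which is the 20-byte length passed to svc_rdma_map_reply_hdr
+ * below.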
|
|
|
+ */
|
|
|
+static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
|
|
|
+ __be32 *rdma_resp, struct svc_rqst *rqstp)
|
|
|
+{
|
|
|
+ struct svc_rdma_op_ctxt *ctxt;
|
|
|
+ __be32 *p;
|
|
|
+ int ret;
|
|
|
+
|
|
|
+ ctxt = svc_rdma_get_context(rdma);
|
|
|
+
|
|
|
+ /* Replace the original transport header with an
|
|
|
+ * RDMA_ERROR response. XID etc are preserved.
|
|
|
*/
|
|
|
- pages = rqstp->rq_next_page - rqstp->rq_respages;
|
|
|
- for (page_no = 0; page_no < pages; page_no++) {
|
|
|
- ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
|
|
|
- ctxt->count++;
|
|
|
- rqstp->rq_respages[page_no] = NULL;
|
|
|
- }
|
|
|
- rqstp->rq_next_page = rqstp->rq_respages + 1;
|
|
|
+ p = rdma_resp + 3;
|
|
|
+ *p++ = rdma_error;
|
|
|
+ *p = err_chunk;
|
|
|
|
|
|
- if (sge_no > rdma->sc_max_sge) {
|
|
|
- pr_err("svcrdma: Too many sges (%d)\n", sge_no);
|
|
|
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20);
|
|
|
+ if (ret < 0)
|
|
|
goto err;
|
|
|
- }
|
|
|
- memset(&send_wr, 0, sizeof send_wr);
|
|
|
- ctxt->cqe.done = svc_rdma_wc_send;
|
|
|
- send_wr.wr_cqe = &ctxt->cqe;
|
|
|
- send_wr.sg_list = ctxt->sge;
|
|
|
- send_wr.num_sge = sge_no;
|
|
|
- if (inv_rkey) {
|
|
|
- send_wr.opcode = IB_WR_SEND_WITH_INV;
|
|
|
- send_wr.ex.invalidate_rkey = inv_rkey;
|
|
|
- } else
|
|
|
- send_wr.opcode = IB_WR_SEND;
|
|
|
- send_wr.send_flags = IB_SEND_SIGNALED;
|
|
|
|
|
|
- ret = svc_rdma_send(rdma, &send_wr);
|
|
|
+ svc_rdma_save_io_pages(rqstp, ctxt);
|
|
|
+
|
|
|
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0);
|
|
|
if (ret)
|
|
|
goto err;
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
- err:
|
|
|
+err:
|
|
|
+ pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
|
|
|
svc_rdma_unmap_dma(ctxt);
|
|
|
svc_rdma_put_context(ctxt, 1);
|
|
|
return ret;
|
|
@@ -552,39 +599,36 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
|
|
|
{
|
|
|
}
|
|
|
|
|
|
+/**
|
|
|
+ * svc_rdma_sendto - Transmit an RPC reply
|
|
|
+ * @rqstp: processed RPC request, reply XDR already in ::rq_res
|
|
|
+ *
|
|
|
+ * Any resources still associated with @rqstp are released upon return.
|
|
|
+ * If no reply message was possible, the connection is closed.
|
|
|
+ *
|
|
|
+ * Returns:
|
|
|
+ * %0 if an RPC reply has been successfully posted,
|
|
|
+ * %-ENOMEM if a resource shortage occurred (connection is lost),
|
|
|
+ * %-ENOTCONN if posting failed (connection is lost).
|
|
|
+ */
|
|
|
int svc_rdma_sendto(struct svc_rqst *rqstp)
|
|
|
{
|
|
|
struct svc_xprt *xprt = rqstp->rq_xprt;
|
|
|
struct svcxprt_rdma *rdma =
|
|
|
container_of(xprt, struct svcxprt_rdma, sc_xprt);
|
|
|
- struct rpcrdma_msg *rdma_argp;
|
|
|
- struct rpcrdma_msg *rdma_resp;
|
|
|
- struct rpcrdma_write_array *wr_ary, *rp_ary;
|
|
|
- int ret;
|
|
|
- int inline_bytes;
|
|
|
+ __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
|
|
|
+ struct xdr_buf *xdr = &rqstp->rq_res;
|
|
|
struct page *res_page;
|
|
|
- struct svc_rdma_req_map *vec;
|
|
|
- u32 inv_rkey;
|
|
|
- __be32 *p;
|
|
|
-
|
|
|
- dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
|
|
|
+ int ret;
|
|
|
|
|
|
- /* Get the RDMA request header. The receive logic always
|
|
|
- * places this at the start of page 0.
|
|
|
+ /* Find the call's chunk lists to decide how to send the reply.
|
|
|
+ * Receive places the Call's xprt header at the start of page 0.
|
|
|
*/
|
|
|
rdma_argp = page_address(rqstp->rq_pages[0]);
|
|
|
- svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary);
|
|
|
-
|
|
|
- inv_rkey = 0;
|
|
|
- if (rdma->sc_snd_w_inv)
|
|
|
- inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
|
|
|
+ svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);
|
|
|
|
|
|
- /* Build an req vec for the XDR */
|
|
|
- vec = svc_rdma_get_req_map(rdma);
|
|
|
- ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
|
|
|
- if (ret)
|
|
|
- goto err0;
|
|
|
- inline_bytes = rqstp->rq_res.len;
|
|
|
+ dprintk("svcrdma: preparing response for XID 0x%08x\n",
|
|
|
+ be32_to_cpup(rdma_argp));
|
|
|
|
|
|
/* Create the RDMA response header. xprt->xpt_mutex,
|
|
|
* acquired in svc_send(), serializes RPC replies. The
|
|
@@ -598,115 +642,57 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
|
|
|
goto err0;
|
|
|
rdma_resp = page_address(res_page);
|
|
|
|
|
|
- p = &rdma_resp->rm_xid;
|
|
|
- *p++ = rdma_argp->rm_xid;
|
|
|
- *p++ = rdma_argp->rm_vers;
|
|
|
+ p = rdma_resp;
|
|
|
+ *p++ = *rdma_argp;
|
|
|
+ *p++ = *(rdma_argp + 1);
|
|
|
*p++ = rdma->sc_fc_credits;
|
|
|
- *p++ = rp_ary ? rdma_nomsg : rdma_msg;
|
|
|
+ *p++ = rp_ch ? rdma_nomsg : rdma_msg;
|
|
|
|
|
|
/* Start with empty chunks */
|
|
|
*p++ = xdr_zero;
|
|
|
*p++ = xdr_zero;
|
|
|
*p = xdr_zero;
|
|
|
|
|
|
- /* Send any write-chunk data and build resp write-list */
|
|
|
- if (wr_ary) {
|
|
|
- ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec);
|
|
|
+ if (wr_lst) {
|
|
|
+ /* XXX: Presume the client sent only one Write chunk */
|
|
|
+ ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
|
|
|
if (ret < 0)
|
|
|
- goto err1;
|
|
|
- inline_bytes -= ret + xdr_padsize(ret);
|
|
|
+ goto err2;
|
|
|
+ svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
|
|
|
}
|
|
|
-
|
|
|
- /* Send any reply-list data and update resp reply-list */
|
|
|
- if (rp_ary) {
|
|
|
- ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec);
|
|
|
+ if (rp_ch) {
|
|
|
+ ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
|
|
|
if (ret < 0)
|
|
|
- goto err1;
|
|
|
- inline_bytes -= ret;
|
|
|
+ goto err2;
|
|
|
+ svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
|
|
|
}
|
|
|
|
|
|
- /* Post a fresh Receive buffer _before_ sending the reply */
|
|
|
ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
|
|
|
if (ret)
|
|
|
goto err1;
|
|
|
-
|
|
|
- ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
|
|
|
- inline_bytes, inv_rkey);
|
|
|
+ ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp,
|
|
|
+ wr_lst, rp_ch);
|
|
|
if (ret < 0)
|
|
|
goto err0;
|
|
|
+ return 0;
|
|
|
|
|
|
- svc_rdma_put_req_map(rdma, vec);
|
|
|
- dprintk("svcrdma: send_reply returns %d\n", ret);
|
|
|
- return ret;
|
|
|
+ err2:
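+ /* Any error other than -E2BIG is fatal. For -E2BIG, the Reply did
+ * not fit the client-provided chunks; report that to the client
+ * rather than closing the connection.
+ */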
|
|
|
+ if (ret != -E2BIG)
|
|
|
+ goto err1;
|
|
|
+
|
|
|
+ ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
|
|
|
+ if (ret)
|
|
|
+ goto err1;
|
|
|
+ ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp);
|
|
|
+ if (ret < 0)
|
|
|
+ goto err0;
|
|
|
+ return 0;
|
|
|
|
|
|
err1:
|
|
|
put_page(res_page);
|
|
|
err0:
|
|
|
- svc_rdma_put_req_map(rdma, vec);
|
|
|
pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
|
|
|
ret);
|
|
|
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
|
|
|
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
|
|
|
return -ENOTCONN;
|
|
|
}
|
|
|
-
|
|
|
-void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
|
|
|
- int status)
|
|
|
-{
|
|
|
- struct ib_send_wr err_wr;
|
|
|
- struct page *p;
|
|
|
- struct svc_rdma_op_ctxt *ctxt;
|
|
|
- enum rpcrdma_errcode err;
|
|
|
- __be32 *va;
|
|
|
- int length;
|
|
|
- int ret;
|
|
|
-
|
|
|
- ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
|
|
|
- if (ret)
|
|
|
- return;
|
|
|
-
|
|
|
- p = alloc_page(GFP_KERNEL);
|
|
|
- if (!p)
|
|
|
- return;
|
|
|
- va = page_address(p);
|
|
|
-
|
|
|
- /* XDR encode an error reply */
|
|
|
- err = ERR_CHUNK;
|
|
|
- if (status == -EPROTONOSUPPORT)
|
|
|
- err = ERR_VERS;
|
|
|
- length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
|
|
|
-
|
|
|
- ctxt = svc_rdma_get_context(xprt);
|
|
|
- ctxt->direction = DMA_TO_DEVICE;
|
|
|
- ctxt->count = 1;
|
|
|
- ctxt->pages[0] = p;
|
|
|
-
|
|
|
- /* Prepare SGE for local address */
|
|
|
- ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
|
|
|
- ctxt->sge[0].length = length;
|
|
|
- ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
|
|
|
- p, 0, length, DMA_TO_DEVICE);
|
|
|
- if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
|
|
|
- dprintk("svcrdma: Error mapping buffer for protocol error\n");
|
|
|
- svc_rdma_put_context(ctxt, 1);
|
|
|
- return;
|
|
|
- }
|
|
|
- svc_rdma_count_mappings(xprt, ctxt);
|
|
|
-
|
|
|
- /* Prepare SEND WR */
|
|
|
- memset(&err_wr, 0, sizeof(err_wr));
|
|
|
- ctxt->cqe.done = svc_rdma_wc_send;
|
|
|
- err_wr.wr_cqe = &ctxt->cqe;
|
|
|
- err_wr.sg_list = ctxt->sge;
|
|
|
- err_wr.num_sge = 1;
|
|
|
- err_wr.opcode = IB_WR_SEND;
|
|
|
- err_wr.send_flags = IB_SEND_SIGNALED;
|
|
|
-
|
|
|
- /* Post It */
|
|
|
- ret = svc_rdma_send(xprt, &err_wr);
|
|
|
- if (ret) {
|
|
|
- dprintk("svcrdma: Error %d posting send for protocol error\n",
|
|
|
- ret);
|
|
|
- svc_rdma_unmap_dma(ctxt);
|
|
|
- svc_rdma_put_context(ctxt, 1);
|
|
|
- }
|
|
|
-}
|