Merge tag 'nfs-rdma-for-4.16-1' of git://git.linux-nfs.org/projects/anna/linux-nfs

NFS-over-RDMA client updates for Linux 4.16

New features:
- xprtrdma tracepoints

Bugfixes and cleanups:
- Fix memory leak if rpcrdma_buffer_create() fails
- Fix allocating extra rpcrdma_reps for the backchannel
- Remove various unused and redundant variables and lock cycles
- Fix IPv6 support in xprt_rdma_set_port()
- Fix memory leak by calling buf_free for callback replies
- Fix "bytes registered" accounting
- Fix kernel-doc comments
- SUNRPC tracepoint cleanups for consistent information
- Optimizations for __rpc_execute()
Trond Myklebust 7 years ago
Parent
Current commit
8f39fce84a

+ 1 - 1
include/linux/sunrpc/xprtrdma.h

@@ -64,7 +64,7 @@ enum rpcrdma_memreg {
 	RPCRDMA_MEMWINDOWS,
 	RPCRDMA_MEMWINDOWS_ASYNC,
 	RPCRDMA_MTHCAFMR,
-	RPCRDMA_FRMR,
+	RPCRDMA_FRWR,
 	RPCRDMA_ALLPHYSICAL,
 	RPCRDMA_LAST
 };

+ 129 - 0
include/trace/events/rdma.h

@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2017 Oracle.  All rights reserved.
+ */
+
+/*
+ * enum ib_event_type, from include/rdma/ib_verbs.h
+ */
+
+#define IB_EVENT_LIST				\
+	ib_event(CQ_ERR)			\
+	ib_event(QP_FATAL)			\
+	ib_event(QP_REQ_ERR)			\
+	ib_event(QP_ACCESS_ERR)			\
+	ib_event(COMM_EST)			\
+	ib_event(SQ_DRAINED)			\
+	ib_event(PATH_MIG)			\
+	ib_event(PATH_MIG_ERR)			\
+	ib_event(DEVICE_FATAL)			\
+	ib_event(PORT_ACTIVE)			\
+	ib_event(PORT_ERR)			\
+	ib_event(LID_CHANGE)			\
+	ib_event(PKEY_CHANGE)			\
+	ib_event(SM_CHANGE)			\
+	ib_event(SRQ_ERR)			\
+	ib_event(SRQ_LIMIT_REACHED)		\
+	ib_event(QP_LAST_WQE_REACHED)		\
+	ib_event(CLIENT_REREGISTER)		\
+	ib_event(GID_CHANGE)			\
+	ib_event_end(WQ_FATAL)
+
+#undef ib_event
+#undef ib_event_end
+
+#define ib_event(x)		TRACE_DEFINE_ENUM(IB_EVENT_##x);
+#define ib_event_end(x)		TRACE_DEFINE_ENUM(IB_EVENT_##x);
+
+IB_EVENT_LIST
+
+#undef ib_event
+#undef ib_event_end
+
+#define ib_event(x)		{ IB_EVENT_##x, #x },
+#define ib_event_end(x)		{ IB_EVENT_##x, #x }
+
+#define rdma_show_ib_event(x) \
+		__print_symbolic(x, IB_EVENT_LIST)
+
+/*
+ * enum ib_wc_status type, from include/rdma/ib_verbs.h
+ */
+#define IB_WC_STATUS_LIST			\
+	ib_wc_status(SUCCESS)			\
+	ib_wc_status(LOC_LEN_ERR)		\
+	ib_wc_status(LOC_QP_OP_ERR)		\
+	ib_wc_status(LOC_EEC_OP_ERR)		\
+	ib_wc_status(LOC_PROT_ERR)		\
+	ib_wc_status(WR_FLUSH_ERR)		\
+	ib_wc_status(MW_BIND_ERR)		\
+	ib_wc_status(BAD_RESP_ERR)		\
+	ib_wc_status(LOC_ACCESS_ERR)		\
+	ib_wc_status(REM_INV_REQ_ERR)		\
+	ib_wc_status(REM_ACCESS_ERR)		\
+	ib_wc_status(REM_OP_ERR)		\
+	ib_wc_status(RETRY_EXC_ERR)		\
+	ib_wc_status(RNR_RETRY_EXC_ERR)		\
+	ib_wc_status(LOC_RDD_VIOL_ERR)		\
+	ib_wc_status(REM_INV_RD_REQ_ERR)	\
+	ib_wc_status(REM_ABORT_ERR)		\
+	ib_wc_status(INV_EECN_ERR)		\
+	ib_wc_status(INV_EEC_STATE_ERR)		\
+	ib_wc_status(FATAL_ERR)			\
+	ib_wc_status(RESP_TIMEOUT_ERR)		\
+	ib_wc_status_end(GENERAL_ERR)
+
+#undef ib_wc_status
+#undef ib_wc_status_end
+
+#define ib_wc_status(x)		TRACE_DEFINE_ENUM(IB_WC_##x);
+#define ib_wc_status_end(x)	TRACE_DEFINE_ENUM(IB_WC_##x);
+
+IB_WC_STATUS_LIST
+
+#undef ib_wc_status
+#undef ib_wc_status_end
+
+#define ib_wc_status(x)		{ IB_WC_##x, #x },
+#define ib_wc_status_end(x)	{ IB_WC_##x, #x }
+
+#define rdma_show_wc_status(x) \
+		__print_symbolic(x, IB_WC_STATUS_LIST)
+
+/*
+ * enum rdma_cm_event_type, from include/rdma/rdma_cm.h
+ */
+#define RDMA_CM_EVENT_LIST			\
+	rdma_cm_event(ADDR_RESOLVED)		\
+	rdma_cm_event(ADDR_ERROR)		\
+	rdma_cm_event(ROUTE_RESOLVED)		\
+	rdma_cm_event(ROUTE_ERROR)		\
+	rdma_cm_event(CONNECT_REQUEST)		\
+	rdma_cm_event(CONNECT_RESPONSE)		\
+	rdma_cm_event(CONNECT_ERROR)		\
+	rdma_cm_event(UNREACHABLE)		\
+	rdma_cm_event(REJECTED)			\
+	rdma_cm_event(ESTABLISHED)		\
+	rdma_cm_event(DISCONNECTED)		\
+	rdma_cm_event(DEVICE_REMOVAL)		\
+	rdma_cm_event(MULTICAST_JOIN)		\
+	rdma_cm_event(MULTICAST_ERROR)		\
+	rdma_cm_event(ADDR_CHANGE)		\
+	rdma_cm_event_end(TIMEWAIT_EXIT)
+
+#undef rdma_cm_event
+#undef rdma_cm_event_end
+
+#define rdma_cm_event(x)	TRACE_DEFINE_ENUM(RDMA_CM_EVENT_##x);
+#define rdma_cm_event_end(x)	TRACE_DEFINE_ENUM(RDMA_CM_EVENT_##x);
+
+RDMA_CM_EVENT_LIST
+
+#undef rdma_cm_event
+#undef rdma_cm_event_end
+
+#define rdma_cm_event(x)	{ RDMA_CM_EVENT_##x, #x },
+#define rdma_cm_event_end(x)	{ RDMA_CM_EVENT_##x, #x }
+
+#define rdma_show_cm_event(x) \
+		__print_symbolic(x, RDMA_CM_EVENT_LIST)
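
The file above leans on the X-macro idiom: each event list is expanded twice, once with per-item macros that emit TRACE_DEFINE_ENUM() stanzas (so enum values can be resolved when the format strings are parsed), and once to build the { value, "name" } pairs that __print_symbolic() consumes; the *_end variant exists so the generated table has no trailing comma. A minimal, runnable userspace sketch of the same trick (COLOR_LIST and its helpers are invented for this illustration):

	#include <stdio.h>

	#define COLOR_LIST		\
		color(RED)		\
		color(GREEN)		\
		color_end(BLUE)

	/* First expansion: generate the enum itself. */
	#define color(x)	COLOR_##x,
	#define color_end(x)	COLOR_##x
	enum color { COLOR_LIST };
	#undef color
	#undef color_end

	/* Second expansion: generate a value -> name table. */
	#define color(x)	{ COLOR_##x, #x },
	#define color_end(x)	{ COLOR_##x, #x }
	static const struct { int val; const char *name; } color_names[] = {
		COLOR_LIST
	};

	int main(void)
	{
		unsigned int i;

		for (i = 0; i < sizeof(color_names) / sizeof(color_names[0]); i++)
			printf("%d = %s\n", color_names[i].val, color_names[i].name);
		return 0;
	}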

+ 890 - 0
include/trace/events/rpcrdma.h

@@ -0,0 +1,890 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2017 Oracle.  All rights reserved.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rpcrdma
+
+#if !defined(_TRACE_RPCRDMA_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RPCRDMA_H
+
+#include <linux/tracepoint.h>
+#include <trace/events/rdma.h>
+
+/**
+ ** Event classes
+ **/
+
+DECLARE_EVENT_CLASS(xprtrdma_reply_event,
+	TP_PROTO(
+		const struct rpcrdma_rep *rep
+	),
+
+	TP_ARGS(rep),
+
+	TP_STRUCT__entry(
+		__field(const void *, rep)
+		__field(const void *, r_xprt)
+		__field(u32, xid)
+		__field(u32, version)
+		__field(u32, proc)
+	),
+
+	TP_fast_assign(
+		__entry->rep = rep;
+		__entry->r_xprt = rep->rr_rxprt;
+		__entry->xid = be32_to_cpu(rep->rr_xid);
+		__entry->version = be32_to_cpu(rep->rr_vers);
+		__entry->proc = be32_to_cpu(rep->rr_proc);
+	),
+
+	TP_printk("rxprt %p xid=0x%08x rep=%p: version %u proc %u",
+		__entry->r_xprt, __entry->xid, __entry->rep,
+		__entry->version, __entry->proc
+	)
+);
+
+#define DEFINE_REPLY_EVENT(name)					\
+		DEFINE_EVENT(xprtrdma_reply_event, name,		\
+				TP_PROTO(				\
+					const struct rpcrdma_rep *rep	\
+				),					\
+				TP_ARGS(rep))
+
+DECLARE_EVENT_CLASS(xprtrdma_rxprt,
+	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt
+	),
+
+	TP_ARGS(r_xprt),
+
+	TP_STRUCT__entry(
+		__field(const void *, r_xprt)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
+	),
+
+	TP_fast_assign(
+		__entry->r_xprt = r_xprt;
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
+	),
+
+	TP_printk("peer=[%s]:%s r_xprt=%p",
+		__get_str(addr), __get_str(port), __entry->r_xprt
+	)
+);
+
+#define DEFINE_RXPRT_EVENT(name)					\
+		DEFINE_EVENT(xprtrdma_rxprt, name,			\
+				TP_PROTO(				\
+					const struct rpcrdma_xprt *r_xprt \
+				),					\
+				TP_ARGS(r_xprt))
+
+DECLARE_EVENT_CLASS(xprtrdma_rdch_event,
+	TP_PROTO(
+		const struct rpc_task *task,
+		unsigned int pos,
+		struct rpcrdma_mr *mr,
+		int nsegs
+	),
+
+	TP_ARGS(task, pos, mr, nsegs),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(const void *, mr)
+		__field(unsigned int, pos)
+		__field(int, nents)
+		__field(u32, handle)
+		__field(u32, length)
+		__field(u64, offset)
+		__field(int, nsegs)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->mr = mr;
+		__entry->pos = pos;
+		__entry->nents = mr->mr_nents;
+		__entry->handle = mr->mr_handle;
+		__entry->length = mr->mr_length;
+		__entry->offset = mr->mr_offset;
+		__entry->nsegs = nsegs;
+	),
+
+	TP_printk("task:%u@%u mr=%p pos=%u %u@0x%016llx:0x%08x (%s)",
+		__entry->task_id, __entry->client_id, __entry->mr,
+		__entry->pos, __entry->length,
+		(unsigned long long)__entry->offset, __entry->handle,
+		__entry->nents < __entry->nsegs ? "more" : "last"
+	)
+);
+
+#define DEFINE_RDCH_EVENT(name)						\
+		DEFINE_EVENT(xprtrdma_rdch_event, name,			\
+				TP_PROTO(				\
+					const struct rpc_task *task,	\
+					unsigned int pos,		\
+					struct rpcrdma_mr *mr,		\
+					int nsegs			\
+				),					\
+				TP_ARGS(task, pos, mr, nsegs))
+
+DECLARE_EVENT_CLASS(xprtrdma_wrch_event,
+	TP_PROTO(
+		const struct rpc_task *task,
+		struct rpcrdma_mr *mr,
+		int nsegs
+	),
+
+	TP_ARGS(task, mr, nsegs),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(const void *, mr)
+		__field(int, nents)
+		__field(u32, handle)
+		__field(u32, length)
+		__field(u64, offset)
+		__field(int, nsegs)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->mr = mr;
+		__entry->nents = mr->mr_nents;
+		__entry->handle = mr->mr_handle;
+		__entry->length = mr->mr_length;
+		__entry->offset = mr->mr_offset;
+		__entry->nsegs = nsegs;
+	),
+
+	TP_printk("task:%u@%u mr=%p %u@0x%016llx:0x%08x (%s)",
+		__entry->task_id, __entry->client_id, __entry->mr,
+		__entry->length, (unsigned long long)__entry->offset,
+		__entry->handle,
+		__entry->nents < __entry->nsegs ? "more" : "last"
+	)
+);
+
+#define DEFINE_WRCH_EVENT(name)						\
+		DEFINE_EVENT(xprtrdma_wrch_event, name,			\
+				TP_PROTO(				\
+					const struct rpc_task *task,	\
+					struct rpcrdma_mr *mr,		\
+					int nsegs			\
+				),					\
+				TP_ARGS(task, mr, nsegs))
+
+TRACE_DEFINE_ENUM(FRWR_IS_INVALID);
+TRACE_DEFINE_ENUM(FRWR_IS_VALID);
+TRACE_DEFINE_ENUM(FRWR_FLUSHED_FR);
+TRACE_DEFINE_ENUM(FRWR_FLUSHED_LI);
+
+#define xprtrdma_show_frwr_state(x)					\
+		__print_symbolic(x,					\
+				{ FRWR_IS_INVALID, "INVALID" },		\
+				{ FRWR_IS_VALID, "VALID" },		\
+				{ FRWR_FLUSHED_FR, "FLUSHED_FR" },	\
+				{ FRWR_FLUSHED_LI, "FLUSHED_LI" })
+
+DECLARE_EVENT_CLASS(xprtrdma_frwr_done,
+	TP_PROTO(
+		const struct ib_wc *wc,
+		const struct rpcrdma_frwr *frwr
+	),
+
+	TP_ARGS(wc, frwr),
+
+	TP_STRUCT__entry(
+		__field(const void *, mr)
+		__field(unsigned int, state)
+		__field(unsigned int, status)
+		__field(unsigned int, vendor_err)
+	),
+
+	TP_fast_assign(
+		__entry->mr = container_of(frwr, struct rpcrdma_mr, frwr);
+		__entry->state = frwr->fr_state;
+		__entry->status = wc->status;
+		__entry->vendor_err = __entry->status ? wc->vendor_err : 0;
+	),
+
+	TP_printk(
+		"mr=%p state=%s: %s (%u/0x%x)",
+		__entry->mr, xprtrdma_show_frwr_state(__entry->state),
+		rdma_show_wc_status(__entry->status),
+		__entry->status, __entry->vendor_err
+	)
+);
+
+#define DEFINE_FRWR_DONE_EVENT(name)					\
+		DEFINE_EVENT(xprtrdma_frwr_done, name,			\
+				TP_PROTO(				\
+					const struct ib_wc *wc,		\
+					const struct rpcrdma_frwr *frwr	\
+				),					\
+				TP_ARGS(wc, frwr))
+
+DECLARE_EVENT_CLASS(xprtrdma_mr,
+	TP_PROTO(
+		const struct rpcrdma_mr *mr
+	),
+
+	TP_ARGS(mr),
+
+	TP_STRUCT__entry(
+		__field(const void *, mr)
+		__field(u32, handle)
+		__field(u32, length)
+		__field(u64, offset)
+	),
+
+	TP_fast_assign(
+		__entry->mr = mr;
+		__entry->handle = mr->mr_handle;
+		__entry->length = mr->mr_length;
+		__entry->offset = mr->mr_offset;
+	),
+
+	TP_printk("mr=%p %u@0x%016llx:0x%08x",
+		__entry->mr, __entry->length,
+		(unsigned long long)__entry->offset,
+		__entry->handle
+	)
+);
+
+#define DEFINE_MR_EVENT(name) \
+		DEFINE_EVENT(xprtrdma_mr, name, \
+				TP_PROTO( \
+					const struct rpcrdma_mr *mr \
+				), \
+				TP_ARGS(mr))
+
+DECLARE_EVENT_CLASS(xprtrdma_cb_event,
+	TP_PROTO(
+		const struct rpc_rqst *rqst
+	),
+
+	TP_ARGS(rqst),
+
+	TP_STRUCT__entry(
+		__field(const void *, rqst)
+		__field(const void *, rep)
+		__field(const void *, req)
+		__field(u32, xid)
+	),
+
+	TP_fast_assign(
+		__entry->rqst = rqst;
+		__entry->req = rpcr_to_rdmar(rqst);
+		__entry->rep = rpcr_to_rdmar(rqst)->rl_reply;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+	),
+
+	TP_printk("xid=0x%08x, rqst=%p req=%p rep=%p",
+		__entry->xid, __entry->rqst, __entry->req, __entry->rep
+	)
+);
+
+#define DEFINE_CB_EVENT(name)						\
+		DEFINE_EVENT(xprtrdma_cb_event, name,			\
+				TP_PROTO(				\
+					const struct rpc_rqst *rqst	\
+				),					\
+				TP_ARGS(rqst))
+
+/**
+ ** Connection events
+ **/
+
+TRACE_EVENT(xprtrdma_conn_upcall,
+	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt,
+		struct rdma_cm_event *event
+	),
+
+	TP_ARGS(r_xprt, event),
+
+	TP_STRUCT__entry(
+		__field(const void *, r_xprt)
+		__field(unsigned int, event)
+		__field(int, status)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
+	),
+
+	TP_fast_assign(
+		__entry->r_xprt = r_xprt;
+		__entry->event = event->event;
+		__entry->status = event->status;
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
+	),
+
+	TP_printk("peer=[%s]:%s r_xprt=%p: %s (%u/%d)",
+		__get_str(addr), __get_str(port),
+		__entry->r_xprt, rdma_show_cm_event(__entry->event),
+		__entry->event, __entry->status
+	)
+);
+
+TRACE_EVENT(xprtrdma_disconnect,
+	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt,
+		int status
+	),
+
+	TP_ARGS(r_xprt, status),
+
+	TP_STRUCT__entry(
+		__field(const void *, r_xprt)
+		__field(int, status)
+		__field(int, connected)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
+	),
+
+	TP_fast_assign(
+		__entry->r_xprt = r_xprt;
+		__entry->status = status;
+		__entry->connected = r_xprt->rx_ep.rep_connected;
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
+	),
+
+	TP_printk("peer=[%s]:%s r_xprt=%p: status=%d %sconnected",
+		__get_str(addr), __get_str(port),
+		__entry->r_xprt, __entry->status,
+		__entry->connected == 1 ? "still " : "dis"
+	)
+);
+
+DEFINE_RXPRT_EVENT(xprtrdma_conn_start);
+DEFINE_RXPRT_EVENT(xprtrdma_conn_tout);
+DEFINE_RXPRT_EVENT(xprtrdma_create);
+DEFINE_RXPRT_EVENT(xprtrdma_destroy);
+DEFINE_RXPRT_EVENT(xprtrdma_remove);
+DEFINE_RXPRT_EVENT(xprtrdma_reinsert);
+DEFINE_RXPRT_EVENT(xprtrdma_reconnect);
+DEFINE_RXPRT_EVENT(xprtrdma_inject_dsc);
+
+TRACE_EVENT(xprtrdma_qp_error,
+	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt,
+		const struct ib_event *event
+	),
+
+	TP_ARGS(r_xprt, event),
+
+	TP_STRUCT__entry(
+		__field(const void *, r_xprt)
+		__field(unsigned int, event)
+		__string(name, event->device->name)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
+	),
+
+	TP_fast_assign(
+		__entry->r_xprt = r_xprt;
+		__entry->event = event->event;
+		__assign_str(name, event->device->name);
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
+	),
+
+	TP_printk("peer=[%s]:%s r_xprt=%p: dev %s: %s (%u)",
+		__get_str(addr), __get_str(port), __entry->r_xprt,
+		__get_str(name), rdma_show_ib_event(__entry->event),
+		__entry->event
+	)
+);
+
+/**
+ ** Call events
+ **/
+
+TRACE_EVENT(xprtrdma_createmrs,
+	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt,
+		unsigned int count
+	),
+
+	TP_ARGS(r_xprt, count),
+
+	TP_STRUCT__entry(
+		__field(const void *, r_xprt)
+		__field(unsigned int, count)
+	),
+
+	TP_fast_assign(
+		__entry->r_xprt = r_xprt;
+		__entry->count = count;
+	),
+
+	TP_printk("r_xprt=%p: created %u MRs",
+		__entry->r_xprt, __entry->count
+	)
+);
+
+DEFINE_RXPRT_EVENT(xprtrdma_nomrs);
+
+DEFINE_RDCH_EVENT(xprtrdma_read_chunk);
+DEFINE_WRCH_EVENT(xprtrdma_write_chunk);
+DEFINE_WRCH_EVENT(xprtrdma_reply_chunk);
+
+TRACE_DEFINE_ENUM(rpcrdma_noch);
+TRACE_DEFINE_ENUM(rpcrdma_readch);
+TRACE_DEFINE_ENUM(rpcrdma_areadch);
+TRACE_DEFINE_ENUM(rpcrdma_writech);
+TRACE_DEFINE_ENUM(rpcrdma_replych);
+
+#define xprtrdma_show_chunktype(x)					\
+		__print_symbolic(x,					\
+				{ rpcrdma_noch, "inline" },		\
+				{ rpcrdma_readch, "read list" },	\
+				{ rpcrdma_areadch, "*read list" },	\
+				{ rpcrdma_writech, "write list" },	\
+				{ rpcrdma_replych, "reply chunk" })
+
+TRACE_EVENT(xprtrdma_marshal,
+	TP_PROTO(
+		const struct rpc_rqst *rqst,
+		unsigned int hdrlen,
+		unsigned int rtype,
+		unsigned int wtype
+	),
+
+	TP_ARGS(rqst, hdrlen, rtype, wtype),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__field(unsigned int, hdrlen)
+		__field(unsigned int, headlen)
+		__field(unsigned int, pagelen)
+		__field(unsigned int, taillen)
+		__field(unsigned int, rtype)
+		__field(unsigned int, wtype)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->hdrlen = hdrlen;
+		__entry->headlen = rqst->rq_snd_buf.head[0].iov_len;
+		__entry->pagelen = rqst->rq_snd_buf.page_len;
+		__entry->taillen = rqst->rq_snd_buf.tail[0].iov_len;
+		__entry->rtype = rtype;
+		__entry->wtype = wtype;
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x: hdr=%u xdr=%u/%u/%u %s/%s",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->hdrlen,
+		__entry->headlen, __entry->pagelen, __entry->taillen,
+		xprtrdma_show_chunktype(__entry->rtype),
+		xprtrdma_show_chunktype(__entry->wtype)
+	)
+);
+
+TRACE_EVENT(xprtrdma_post_send,
+	TP_PROTO(
+		const struct rpcrdma_req *req,
+		int status
+	),
+
+	TP_ARGS(req, status),
+
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(int, num_sge)
+		__field(bool, signaled)
+		__field(int, status)
+	),
+
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->num_sge = req->rl_sendctx->sc_wr.num_sge;
+		__entry->signaled = req->rl_sendctx->sc_wr.send_flags &
+				    IB_SEND_SIGNALED;
+		__entry->status = status;
+	),
+
+	TP_printk("req=%p, %d SGEs%s, status=%d",
+		__entry->req, __entry->num_sge,
+		(__entry->signaled ? ", signaled" : ""),
+		__entry->status
+	)
+);
+
+TRACE_EVENT(xprtrdma_post_recv,
+	TP_PROTO(
+		const struct rpcrdma_rep *rep,
+		int status
+	),
+
+	TP_ARGS(rep, status),
+
+	TP_STRUCT__entry(
+		__field(const void *, rep)
+		__field(int, status)
+	),
+
+	TP_fast_assign(
+		__entry->rep = rep;
+		__entry->status = status;
+	),
+
+	TP_printk("rep=%p status=%d",
+		__entry->rep, __entry->status
+	)
+);
+
+/**
+ ** Completion events
+ **/
+
+TRACE_EVENT(xprtrdma_wc_send,
+	TP_PROTO(
+		const struct rpcrdma_sendctx *sc,
+		const struct ib_wc *wc
+	),
+
+	TP_ARGS(sc, wc),
+
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(unsigned int, unmap_count)
+		__field(unsigned int, status)
+		__field(unsigned int, vendor_err)
+	),
+
+	TP_fast_assign(
+		__entry->req = sc->sc_req;
+		__entry->unmap_count = sc->sc_unmap_count;
+		__entry->status = wc->status;
+		__entry->vendor_err = __entry->status ? wc->vendor_err : 0;
+	),
+
+	TP_printk("req=%p, unmapped %u pages: %s (%u/0x%x)",
+		__entry->req, __entry->unmap_count,
+		rdma_show_wc_status(__entry->status),
+		__entry->status, __entry->vendor_err
+	)
+);
+
+TRACE_EVENT(xprtrdma_wc_receive,
+	TP_PROTO(
+		const struct rpcrdma_rep *rep,
+		const struct ib_wc *wc
+	),
+
+	TP_ARGS(rep, wc),
+
+	TP_STRUCT__entry(
+		__field(const void *, rep)
+		__field(unsigned int, byte_len)
+		__field(unsigned int, status)
+		__field(unsigned int, vendor_err)
+	),
+
+	TP_fast_assign(
+		__entry->rep = rep;
+		__entry->byte_len = wc->byte_len;
+		__entry->status = wc->status;
+		__entry->vendor_err = __entry->status ? wc->vendor_err : 0;
+	),
+
+	TP_printk("rep=%p, %u bytes: %s (%u/0x%x)",
+		__entry->rep, __entry->byte_len,
+		rdma_show_wc_status(__entry->status),
+		__entry->status, __entry->vendor_err
+	)
+);
+
+DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg);
+DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li);
+DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake);
+
+DEFINE_MR_EVENT(xprtrdma_localinv);
+DEFINE_MR_EVENT(xprtrdma_dma_unmap);
+DEFINE_MR_EVENT(xprtrdma_remoteinv);
+DEFINE_MR_EVENT(xprtrdma_recover_mr);
+
+/**
+ ** Reply events
+ **/
+
+TRACE_EVENT(xprtrdma_reply,
+	TP_PROTO(
+		const struct rpc_task *task,
+		const struct rpcrdma_rep *rep,
+		const struct rpcrdma_req *req,
+		unsigned int credits
+	),
+
+	TP_ARGS(task, rep, req, credits),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(const void *, rep)
+		__field(const void *, req)
+		__field(u32, xid)
+		__field(unsigned int, credits)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->rep = rep;
+		__entry->req = req;
+		__entry->xid = be32_to_cpu(rep->rr_xid);
+		__entry->credits = credits;
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x, %u credits, rep=%p -> req=%p",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->credits, __entry->rep, __entry->req
+	)
+);
+
+TRACE_EVENT(xprtrdma_defer_cmp,
+	TP_PROTO(
+		const struct rpcrdma_rep *rep
+	),
+
+	TP_ARGS(rep),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(const void *, rep)
+		__field(u32, xid)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = rep->rr_rqst->rq_task->tk_pid;
+		__entry->client_id = rep->rr_rqst->rq_task->tk_client->cl_clid;
+		__entry->rep = rep;
+		__entry->xid = be32_to_cpu(rep->rr_xid);
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x rep=%p",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->rep
+	)
+);
+
+DEFINE_REPLY_EVENT(xprtrdma_reply_vers);
+DEFINE_REPLY_EVENT(xprtrdma_reply_rqst);
+DEFINE_REPLY_EVENT(xprtrdma_reply_short);
+DEFINE_REPLY_EVENT(xprtrdma_reply_hdr);
+
+TRACE_EVENT(xprtrdma_fixup,
+	TP_PROTO(
+		const struct rpc_rqst *rqst,
+		int len,
+		int hdrlen
+	),
+
+	TP_ARGS(rqst, len, hdrlen),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(const void *, base)
+		__field(int, len)
+		__field(int, hdrlen)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->base = rqst->rq_rcv_buf.head[0].iov_base;
+		__entry->len = len;
+		__entry->hdrlen = hdrlen;
+	),
+
+	TP_printk("task:%u@%u base=%p len=%d hdrlen=%d",
+		__entry->task_id, __entry->client_id,
+		__entry->base, __entry->len, __entry->hdrlen
+	)
+);
+
+TRACE_EVENT(xprtrdma_fixup_pg,
+	TP_PROTO(
+		const struct rpc_rqst *rqst,
+		int pageno,
+		const void *pos,
+		int len,
+		int curlen
+	),
+
+	TP_ARGS(rqst, pageno, pos, len, curlen),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(const void *, pos)
+		__field(int, pageno)
+		__field(int, len)
+		__field(int, curlen)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->pos = pos;
+		__entry->pageno = pageno;
+		__entry->len = len;
+		__entry->curlen = curlen;
+	),
+
+	TP_printk("task:%u@%u pageno=%d pos=%p len=%d curlen=%d",
+		__entry->task_id, __entry->client_id,
+		__entry->pageno, __entry->pos, __entry->len, __entry->curlen
+	)
+);
+
+TRACE_EVENT(xprtrdma_decode_seg,
+	TP_PROTO(
+		u32 handle,
+		u32 length,
+		u64 offset
+	),
+
+	TP_ARGS(handle, length, offset),
+
+	TP_STRUCT__entry(
+		__field(u32, handle)
+		__field(u32, length)
+		__field(u64, offset)
+	),
+
+	TP_fast_assign(
+		__entry->handle = handle;
+		__entry->length = length;
+		__entry->offset = offset;
+	),
+
+	TP_printk("%u@0x%016llx:0x%08x",
+		__entry->length, (unsigned long long)__entry->offset,
+		__entry->handle
+	)
+);
+
+/**
+ ** Allocation/release of rpcrdma_reqs and rpcrdma_reps
+ **/
+
+TRACE_EVENT(xprtrdma_allocate,
+	TP_PROTO(
+		const struct rpc_task *task,
+		const struct rpcrdma_req *req
+	),
+
+	TP_ARGS(task, req),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(const void *, req)
+		__field(const void *, rep)
+		__field(size_t, callsize)
+		__field(size_t, rcvsize)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->req = req;
+		__entry->rep = req ? req->rl_reply : NULL;
+		__entry->callsize = task->tk_rqstp->rq_callsize;
+		__entry->rcvsize = task->tk_rqstp->rq_rcvsize;
+	),
+
+	TP_printk("task:%u@%u req=%p rep=%p (%zu, %zu)",
+		__entry->task_id, __entry->client_id,
+		__entry->req, __entry->rep,
+		__entry->callsize, __entry->rcvsize
+	)
+);
+
+TRACE_EVENT(xprtrdma_rpc_done,
+	TP_PROTO(
+		const struct rpc_task *task,
+		const struct rpcrdma_req *req
+	),
+
+	TP_ARGS(task, req),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(const void *, req)
+		__field(const void *, rep)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->req = req;
+		__entry->rep = req->rl_reply;
+	),
+
+	TP_printk("task:%u@%u req=%p rep=%p",
+		__entry->task_id, __entry->client_id,
+		__entry->req, __entry->rep
+	)
+);
+
+DEFINE_RXPRT_EVENT(xprtrdma_noreps);
+
+/**
+ ** Callback events
+ **/
+
+TRACE_EVENT(xprtrdma_cb_setup,
+	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt,
+		unsigned int reqs
+	),
+
+	TP_ARGS(r_xprt, reqs),
+
+	TP_STRUCT__entry(
+		__field(const void *, r_xprt)
+		__field(unsigned int, reqs)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
+	),
+
+	TP_fast_assign(
+		__entry->r_xprt = r_xprt;
+		__entry->reqs = reqs;
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
+	),
+
+	TP_printk("peer=[%s]:%s r_xprt=%p: %u reqs",
+		__get_str(addr), __get_str(port),
+		__entry->r_xprt, __entry->reqs
+	)
+);
+
+DEFINE_CB_EVENT(xprtrdma_cb_call);
+DEFINE_CB_EVENT(xprtrdma_cb_reply);
+
+#endif /* _TRACE_RPCRDMA_H */
+
+#include <trace/define_trace.h>
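
Note the closing #include <trace/define_trace.h>: the header only declares the events. For them to exist at run time, exactly one compilation unit must expand the declarations into definitions; the conventional kernel pattern is sketched below (xprtrdma does the equivalent in its module source):

	/* In exactly one .c file of the module: turn the TRACE_EVENT()
	 * declarations in the header into real tracepoint definitions.
	 */
	#define CREATE_TRACE_POINTS
	#include <trace/events/rpcrdma.h>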

+ 6 - 2
include/trace/events/sunrpc.h

@@ -32,7 +32,7 @@ DECLARE_EVENT_CLASS(rpc_task_status,
 		__entry->status = task->tk_status;
 	),
 
-	TP_printk("task:%u@%u, status %d",
+	TP_printk("task:%u@%u status=%d",
 		__entry->task_id, __entry->client_id,
 		__entry->status)
 );
@@ -66,7 +66,7 @@ TRACE_EVENT(rpc_connect_status,
 		__entry->status = status;
 	),
 
-	TP_printk("task:%u@%u, status %d",
+	TP_printk("task:%u@%u status=%d",
 		__entry->task_id, __entry->client_id,
 		__entry->status)
 );
@@ -390,6 +390,10 @@ DECLARE_EVENT_CLASS(rpc_xprt_event,
 			__entry->status)
 );
 
+DEFINE_EVENT(rpc_xprt_event, xprt_timer,
+	TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status),
+	TP_ARGS(xprt, xid, status));
+
 DEFINE_EVENT(rpc_xprt_event, xprt_lookup_rqst,
 	TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status),
 	TP_ARGS(xprt, xid, status));
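
DEFINE_EVENT() stamps out a new named tracepoint that shares the class's argument list, field layout, and format string, so the new xprt_timer event reports exactly like xprt_lookup_rqst. In miniature, the mechanism is plain macro-driven code generation; a runnable sketch (EVENT_CLASS and DEFINE_EVENT below are invented stand-ins, not the real kernel macros):

	#include <stdio.h>

	/* One "class" provides the body; each DEFINE stamps out a
	 * named entry point that reuses it.
	 */
	#define EVENT_CLASS(cls)						\
		static void cls##_print(const char *ev, int xid, int status)	\
		{								\
			printf("%s: xid=0x%08x status=%d\n", ev, xid, status);	\
		}

	#define DEFINE_EVENT(cls, name)						\
		static void trace_##name(int xid, int status)			\
		{								\
			cls##_print(#name, xid, status);			\
		}

	EVENT_CLASS(rpc_xprt_event)
	DEFINE_EVENT(rpc_xprt_event, xprt_timer)
	DEFINE_EVENT(rpc_xprt_event, xprt_lookup_rqst)

	int main(void)
	{
		trace_xprt_timer(0x1234, -110);		/* -ETIMEDOUT */
		trace_xprt_lookup_rqst(0x1234, 0);
		return 0;
	}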

+ 12 - 14
net/sunrpc/sched.c

@@ -755,22 +755,20 @@ static void __rpc_execute(struct rpc_task *task)
 		void (*do_action)(struct rpc_task *);
 
 		/*
-		 * Execute any pending callback first.
+		 * Perform the next FSM step or a pending callback.
+		 *
+		 * tk_action may be NULL if the task has been killed.
+		 * In particular, note that rpc_killall_tasks may
+		 * do this at any time, so beware when dereferencing.
 		 */
-		do_action = task->tk_callback;
-		task->tk_callback = NULL;
-		if (do_action == NULL) {
-			/*
-			 * Perform the next FSM step.
-			 * tk_action may be NULL if the task has been killed.
-			 * In particular, note that rpc_killall_tasks may
-			 * do this at any time, so beware when dereferencing.
-			 */
-			do_action = task->tk_action;
-			if (do_action == NULL)
-				break;
+		do_action = task->tk_action;
+		if (task->tk_callback) {
+			do_action = task->tk_callback;
+			task->tk_callback = NULL;
 		}
-		trace_rpc_task_run_action(task->tk_client, task, task->tk_action);
+		if (!do_action)
+			break;
+		trace_rpc_task_run_action(task->tk_client, task, do_action);
 		do_action(task);
 
 		/*
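
The restructuring above also fixes a small tracing inaccuracy: the old loop always logged task->tk_action, even on iterations where a one-shot tk_callback was about to run instead, whereas trace_rpc_task_run_action() now receives the function actually invoked. A standalone sketch of the new dispatch shape (field names mirror struct rpc_task, but this is illustrative only):

	#include <stdio.h>

	struct task {
		void (*tk_action)(struct task *);
		void (*tk_callback)(struct task *);
	};

	/* One iteration: a pending callback, if any, preempts the
	 * regular FSM action and fires exactly once; a single NULL
	 * check covers the "task was killed" case.
	 */
	static int run_one_step(struct task *task)
	{
		void (*do_action)(struct task *);

		do_action = task->tk_action;
		if (task->tk_callback) {
			do_action = task->tk_callback;
			task->tk_callback = NULL;
		}
		if (!do_action)
			return 0;		/* nothing left to run */
		do_action(task);		/* this is what gets traced */
		return 1;
	}

	static void say_action(struct task *t)   { (void)t; puts("action"); }
	static void say_callback(struct task *t) { (void)t; puts("callback"); }

	int main(void)
	{
		struct task t = { say_action, say_callback };

		run_one_step(&t);	/* prints "callback" */
		run_one_step(&t);	/* prints "action" */
		return 0;
	}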

+ 1 - 1
net/sunrpc/xprt.c

@@ -940,8 +940,8 @@ static void xprt_timer(struct rpc_task *task)
 
 	if (task->tk_status != -ETIMEDOUT)
 		return;
-	dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
 
+	trace_xprt_timer(xprt, req->rq_xid, task->tk_status);
 	if (!req->rq_reply_bytes_recvd) {
 		if (xprt->ops->timer)
 			xprt->ops->timer(xprt, task);

+ 46 - 32
net/sunrpc/xprtrdma/backchannel.c

@@ -43,7 +43,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
 	req = rpcrdma_create_req(r_xprt);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
-	__set_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags);
 
 	rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
 				  DMA_TO_DEVICE, GFP_KERNEL);
@@ -74,21 +73,13 @@ out_fail:
 static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
 				 unsigned int count)
 {
-	struct rpcrdma_rep *rep;
 	int rc = 0;
 
 	while (count--) {
-		rep = rpcrdma_create_rep(r_xprt);
-		if (IS_ERR(rep)) {
-			pr_err("RPC:       %s: reply buffer alloc failed\n",
-			       __func__);
-			rc = PTR_ERR(rep);
+		rc = rpcrdma_create_rep(r_xprt);
+		if (rc)
 			break;
-		}
-
-		rpcrdma_recv_buffer_put(rep);
 	}
-
 	return rc;
 }
 
@@ -129,6 +120,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
 		rqst->rq_xprt = &r_xprt->rx_xprt;
 		INIT_LIST_HEAD(&rqst->rq_list);
 		INIT_LIST_HEAD(&rqst->rq_bc_list);
+		__set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
 
 		if (rpcrdma_bc_setup_rqst(r_xprt, rqst))
 			goto out_free;
@@ -148,7 +140,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
 
 	buffer->rb_bc_srv_max_requests = reqs;
 	request_module("svcrdma");
-
+	trace_xprtrdma_cb_setup(r_xprt, reqs);
 	return 0;
 
 out_free:
@@ -196,13 +188,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
 	return maxmsg - RPCRDMA_HDRLEN_MIN;
 }
 
-/**
- * rpcrdma_bc_marshal_reply - Send backwards direction reply
- * @rqst: buffer containing RPC reply data
- *
- * Returns zero on success.
- */
-int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
+static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
@@ -226,7 +212,46 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 	if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
 				      &rqst->rq_snd_buf, rpcrdma_noch))
 		return -EIO;
+
+	trace_xprtrdma_cb_reply(rqst);
+	return 0;
+}
+
+/**
+ * xprt_rdma_bc_send_reply - marshal and send a backchannel reply
+ * @rqst: RPC rqst with a backchannel RPC reply in rq_snd_buf
+ *
+ * Caller holds the transport's write lock.
+ *
+ * Returns:
+ *	%0 if the RPC message has been sent
+ *	%-ENOTCONN if the caller should reconnect and call again
+ *	%-EIO if a permanent error occurred and the request was not
+ *		sent. Do not try to send this message again.
+ */
+int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	int rc;
+
+	if (!xprt_connected(rqst->rq_xprt))
+		goto drop_connection;
+
+	rc = rpcrdma_bc_marshal_reply(rqst);
+	if (rc < 0)
+		goto failed_marshal;
+
+	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+		goto drop_connection;
 	return 0;
+
+failed_marshal:
+	if (rc != -ENOTCONN)
+		return rc;
+drop_connection:
+	xprt_disconnect_done(rqst->rq_xprt);
+	return -ENOTCONN;
 }
 
 /**
@@ -262,11 +287,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
 	dprintk("RPC:       %s: freeing rqst %p (req %p)\n",
 		__func__, rqst, rpcr_to_rdmar(rqst));
 
-	smp_mb__before_atomic();
-	WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state));
-	clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
-	smp_mb__after_atomic();
-
 	spin_lock_bh(&xprt->bc_pa_lock);
 	list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
 	spin_unlock_bh(&xprt->bc_pa_lock);
@@ -274,7 +294,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
 
 /**
  * rpcrdma_bc_receive_call - Handle a backward direction call
- * @xprt: transport receiving the call
+ * @r_xprt: transport receiving the call
  * @rep: receive buffer containing the call
  *
  * Operational assumptions:
@@ -313,7 +333,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 				struct rpc_rqst, rq_bc_pa_list);
 	list_del(&rqst->rq_bc_pa_list);
 	spin_unlock(&xprt->bc_pa_lock);
-	dprintk("RPC:       %s: using rqst %p\n", __func__, rqst);
 
 	/* Prepare rqst */
 	rqst->rq_reply_bytes_recvd = 0;
@@ -321,7 +340,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	rqst->rq_xid = *p;
 
 	rqst->rq_private_buf.len = size;
-	set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
 
 	buf = &rqst->rq_rcv_buf;
 	memset(buf, 0, sizeof(*buf));
@@ -335,12 +353,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	 * the Upper Layer is done decoding it.
 	 */
 	req = rpcr_to_rdmar(rqst);
-	dprintk("RPC:       %s: attaching rep %p to req %p\n",
-		__func__, rep, req);
 	req->rl_reply = rep;
-
-	/* Defeat the retransmit detection logic in send_request */
-	req->rl_connect_cookie = 0;
+	trace_xprtrdma_cb_call(rqst);
 
 	/* Queue rqst for ULP's callback service */
 	bc_serv = xprt->bc_serv;
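
The kernel-doc added on xprt_rdma_bc_send_reply() pins down a three-way return contract: 0 means the reply was posted, -ENOTCONN means reconnect and try again, and -EIO means the reply is unsendable and must not be retried. A runnable userspace sketch of a caller dispatching on that contract (send_reply_stub() is a made-up stand-in for the real transport call):

	#include <errno.h>
	#include <stdio.h>

	/* Made-up stand-in for xprt_rdma_bc_send_reply(). */
	static int send_reply_stub(void)
	{
		return -ENOTCONN;
	}

	int main(void)
	{
		switch (send_reply_stub()) {
		case 0:
			puts("reply sent");
			break;
		case -ENOTCONN:
			puts("connection lost: reconnect, then send again");
			break;
		default:
			puts("permanent error (e.g. -EIO): drop the reply");
			break;
		}
		return 0;
	}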

+ 77 - 80
net/sunrpc/xprtrdma/fmr_ops.c

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2015, 2017 Oracle.  All rights reserved.
  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
  */
 
@@ -47,7 +47,7 @@ fmr_is_supported(struct rpcrdma_ia *ia)
 }
 
 static int
-fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
 {
 	static struct ib_fmr_attr fmr_attr = {
 		.max_pages	= RPCRDMA_MAX_FMR_SGES,
@@ -55,106 +55,108 @@ fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
 		.page_shift	= PAGE_SHIFT
 	};
 
-	mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
+	mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
 				       sizeof(u64), GFP_KERNEL);
-	if (!mw->fmr.fm_physaddrs)
+	if (!mr->fmr.fm_physaddrs)
 		goto out_free;
 
-	mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
-			    sizeof(*mw->mw_sg), GFP_KERNEL);
-	if (!mw->mw_sg)
+	mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
+			    sizeof(*mr->mr_sg), GFP_KERNEL);
+	if (!mr->mr_sg)
 		goto out_free;
 
-	sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
+	sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES);
 
-	mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
+	mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
 				     &fmr_attr);
-	if (IS_ERR(mw->fmr.fm_mr))
+	if (IS_ERR(mr->fmr.fm_mr))
 		goto out_fmr_err;
 
 	return 0;
 
 out_fmr_err:
 	dprintk("RPC:       %s: ib_alloc_fmr returned %ld\n", __func__,
-		PTR_ERR(mw->fmr.fm_mr));
+		PTR_ERR(mr->fmr.fm_mr));
 
 out_free:
-	kfree(mw->mw_sg);
-	kfree(mw->fmr.fm_physaddrs);
+	kfree(mr->mr_sg);
+	kfree(mr->fmr.fm_physaddrs);
 	return -ENOMEM;
 }
 
 static int
-__fmr_unmap(struct rpcrdma_mw *mw)
+__fmr_unmap(struct rpcrdma_mr *mr)
 {
 	LIST_HEAD(l);
 	int rc;
 
-	list_add(&mw->fmr.fm_mr->list, &l);
+	list_add(&mr->fmr.fm_mr->list, &l);
 	rc = ib_unmap_fmr(&l);
-	list_del(&mw->fmr.fm_mr->list);
+	list_del(&mr->fmr.fm_mr->list);
 	return rc;
 }
 
 static void
-fmr_op_release_mr(struct rpcrdma_mw *r)
+fmr_op_release_mr(struct rpcrdma_mr *mr)
 {
 	LIST_HEAD(unmap_list);
 	int rc;
 
 	/* Ensure MW is not on any rl_registered list */
-	if (!list_empty(&r->mw_list))
-		list_del(&r->mw_list);
+	if (!list_empty(&mr->mr_list))
+		list_del(&mr->mr_list);
 
-	kfree(r->fmr.fm_physaddrs);
-	kfree(r->mw_sg);
+	kfree(mr->fmr.fm_physaddrs);
+	kfree(mr->mr_sg);
 
 	/* In case this one was left mapped, try to unmap it
 	 * to prevent dealloc_fmr from failing with EBUSY
 	 */
-	rc = __fmr_unmap(r);
+	rc = __fmr_unmap(mr);
 	if (rc)
 		pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
-		       r, rc);
+		       mr, rc);
 
-	rc = ib_dealloc_fmr(r->fmr.fm_mr);
+	rc = ib_dealloc_fmr(mr->fmr.fm_mr);
 	if (rc)
 		pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
-		       r, rc);
+		       mr, rc);
 
-	kfree(r);
+	kfree(mr);
 }
 
 /* Reset of a single FMR.
  */
 static void
-fmr_op_recover_mr(struct rpcrdma_mw *mw)
+fmr_op_recover_mr(struct rpcrdma_mr *mr)
 {
-	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
 	int rc;
 
 	/* ORDER: invalidate first */
-	rc = __fmr_unmap(mw);
-
-	/* ORDER: then DMA unmap */
-	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
-			mw->mw_sg, mw->mw_nents, mw->mw_dir);
+	rc = __fmr_unmap(mr);
 	if (rc)
 		goto out_release;
 
-	rpcrdma_put_mw(r_xprt, mw);
+	/* ORDER: then DMA unmap */
+	rpcrdma_mr_unmap_and_put(mr);
+
 	r_xprt->rx_stats.mrs_recovered++;
 	return;
 
 out_release:
-	pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
+	pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr);
 	r_xprt->rx_stats.mrs_orphaned++;
 
-	spin_lock(&r_xprt->rx_buf.rb_mwlock);
-	list_del(&mw->mw_all);
-	spin_unlock(&r_xprt->rx_buf.rb_mwlock);
+	trace_xprtrdma_dma_unmap(mr);
+	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+			mr->mr_sg, mr->mr_nents, mr->mr_dir);
+
+	spin_lock(&r_xprt->rx_buf.rb_mrlock);
+	list_del(&mr->mr_all);
+	spin_unlock(&r_xprt->rx_buf.rb_mrlock);
 
-	fmr_op_release_mr(mw);
+	fmr_op_release_mr(mr);
 }
 
 static int
@@ -180,15 +182,15 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
  */
 static struct rpcrdma_mr_seg *
 fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-	   int nsegs, bool writing, struct rpcrdma_mw **out)
+	   int nsegs, bool writing, struct rpcrdma_mr **out)
 {
 	struct rpcrdma_mr_seg *seg1 = seg;
 	int len, pageoff, i, rc;
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 	u64 *dma_pages;
 
-	mw = rpcrdma_get_mw(r_xprt);
-	if (!mw)
+	mr = rpcrdma_mr_get(r_xprt);
+	if (!mr)
 		return ERR_PTR(-ENOBUFS);
 
 	pageoff = offset_in_page(seg1->mr_offset);
@@ -199,12 +201,12 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		nsegs = RPCRDMA_MAX_FMR_SGES;
 	for (i = 0; i < nsegs;) {
 		if (seg->mr_page)
-			sg_set_page(&mw->mw_sg[i],
+			sg_set_page(&mr->mr_sg[i],
 				    seg->mr_page,
 				    seg->mr_len,
 				    offset_in_page(seg->mr_offset));
 		else
-			sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
+			sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
 				   seg->mr_len);
 		len += seg->mr_len;
 		++seg;
@@ -214,40 +216,38 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
 			break;
 	}
-	mw->mw_dir = rpcrdma_data_dir(writing);
+	mr->mr_dir = rpcrdma_data_dir(writing);
 
-	mw->mw_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device,
-				     mw->mw_sg, i, mw->mw_dir);
-	if (!mw->mw_nents)
+	mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device,
+				     mr->mr_sg, i, mr->mr_dir);
+	if (!mr->mr_nents)
 		goto out_dmamap_err;
 
-	for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
-		dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
-	rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
+	for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++)
+		dma_pages[i] = sg_dma_address(&mr->mr_sg[i]);
+	rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents,
 			     dma_pages[0]);
 	if (rc)
 		goto out_maperr;
 
-	mw->mw_handle = mw->fmr.fm_mr->rkey;
-	mw->mw_length = len;
-	mw->mw_offset = dma_pages[0] + pageoff;
+	mr->mr_handle = mr->fmr.fm_mr->rkey;
+	mr->mr_length = len;
+	mr->mr_offset = dma_pages[0] + pageoff;
 
-	*out = mw;
+	*out = mr;
 	return seg;
 
 out_dmamap_err:
 	pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
-	       mw->mw_sg, i);
-	rpcrdma_put_mw(r_xprt, mw);
+	       mr->mr_sg, i);
+	rpcrdma_mr_put(mr);
 	return ERR_PTR(-EIO);
 
 out_maperr:
 	pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
 	       len, (unsigned long long)dma_pages[0],
-	       pageoff, mw->mw_nents, rc);
-	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
-			mw->mw_sg, mw->mw_nents, mw->mw_dir);
-	rpcrdma_put_mw(r_xprt, mw);
+	       pageoff, mr->mr_nents, rc);
+	rpcrdma_mr_unmap_and_put(mr);
 	return ERR_PTR(-EIO);
 }
 
@@ -256,13 +256,13 @@ out_maperr:
  * Sleeps until it is safe for the host CPU to access the
  * previously mapped memory regions.
  *
- * Caller ensures that @mws is not empty before the call. This
+ * Caller ensures that @mrs is not empty before the call. This
  * function empties the list.
  */
 static void
-fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
+fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
 {
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 	LIST_HEAD(unmap_list);
 	int rc;
 
@@ -271,10 +271,11 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
 	 * ib_unmap_fmr() is slow, so use a single call instead
 	 * of one call per mapped FMR.
 	 */
-	list_for_each_entry(mw, mws, mw_list) {
+	list_for_each_entry(mr, mrs, mr_list) {
 		dprintk("RPC:       %s: unmapping fmr %p\n",
-			__func__, &mw->fmr);
-		list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
+			__func__, &mr->fmr);
+		trace_xprtrdma_localinv(mr);
+		list_add_tail(&mr->fmr.fm_mr->list, &unmap_list);
 	}
 	r_xprt->rx_stats.local_inv_needed++;
 	rc = ib_unmap_fmr(&unmap_list);
@@ -284,14 +285,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
 	/* ORDER: Now DMA unmap all of the req's MRs, and return
 	 * them to the free MW list.
 	 */
-	while (!list_empty(mws)) {
-		mw = rpcrdma_pop_mw(mws);
-		dprintk("RPC:       %s: DMA unmapping fmr %p\n",
-			__func__, &mw->fmr);
-		list_del(&mw->fmr.fm_mr->list);
-		ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
-				mw->mw_sg, mw->mw_nents, mw->mw_dir);
-		rpcrdma_put_mw(r_xprt, mw);
+	while (!list_empty(mrs)) {
+		mr = rpcrdma_mr_pop(mrs);
+		list_del(&mr->fmr.fm_mr->list);
+		rpcrdma_mr_unmap_and_put(mr);
 	}
 
 	return;
@@ -299,10 +296,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
 out_reset:
 	pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
 
-	while (!list_empty(mws)) {
-		mw = rpcrdma_pop_mw(mws);
-		list_del(&mw->fmr.fm_mr->list);
-		fmr_op_recover_mr(mw);
+	while (!list_empty(mrs)) {
+		mr = rpcrdma_mr_pop(mrs);
+		list_del(&mr->fmr.fm_mr->list);
+		fmr_op_recover_mr(mr);
 	}
 }
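
As the in-code comment notes, fmr_op_unmap_sync() chains every FMR onto one local list so a single call to the slow ib_unmap_fmr() covers the whole request. The same gather-then-flush shape in a runnable sketch (names invented for the illustration):

	#include <stdio.h>

	struct mr {
		int handle;
		struct mr *next;
	};

	static void unmap_batch(struct mr *head)	/* one "slow" call */
	{
		for (; head; head = head->next)
			printf("invalidating handle 0x%x\n", head->handle);
	}

	int main(void)
	{
		struct mr c = { 0xc3, NULL };
		struct mr b = { 0xb2, &c };
		struct mr a = { 0xa1, &b };

		unmap_batch(&a);	/* single flush for all three MRs */
		return 0;
	}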
 

+ 169 - 160
net/sunrpc/xprtrdma/frwr_ops.c

@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2015, 2017 Oracle.  All rights reserved.
  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
  */
 
 /* Lightweight memory registration using Fast Registration Work
- * Requests (FRWR). Also referred to sometimes as FRMR mode.
+ * Requests (FRWR).
  *
  * FRWR features ordered asynchronous registration and deregistration
  * of arbitrarily sized memory regions. This is the fastest and safest
@@ -15,9 +15,9 @@
 /* Normal operation
  *
  * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
- * Work Request (frmr_op_map). When the RDMA operation is finished, this
+ * Work Request (frwr_op_map). When the RDMA operation is finished, this
  * Memory Region is invalidated using a LOCAL_INV Work Request
- * (frmr_op_unmap).
+ * (frwr_op_unmap_sync).
  *
  * Typically these Work Requests are not signaled, and neither are RDMA
  * SEND Work Requests (with the exception of signaling occasionally to
@@ -26,7 +26,7 @@
  *
  * As an optimization, frwr_op_unmap marks MRs INVALID before the
  * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
- * rb_mws immediately so that no work (like managing a linked list
+ * rb_mrs immediately so that no work (like managing a linked list
  * under a spinlock) is needed in the completion upcall.
  *
  * But this means that frwr_op_map() can occasionally encounter an MR
@@ -60,7 +60,7 @@
  * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered
  * with ib_dereg_mr and then are re-initialized. Because MR recovery
  * allocates fresh resources, it is deferred to a workqueue, and the
- * recovered MRs are placed back on the rb_mws list when recovery is
+ * recovered MRs are placed back on the rb_mrs list when recovery is
  * complete. frwr_op_map allocates another MR for the current RPC while
  * the broken MR is reset.
  *
@@ -96,26 +96,26 @@ out_not_supported:
 }
 
 static int
-frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
 {
-	unsigned int depth = ia->ri_max_frmr_depth;
-	struct rpcrdma_frmr *f = &r->frmr;
+	unsigned int depth = ia->ri_max_frwr_depth;
+	struct rpcrdma_frwr *frwr = &mr->frwr;
 	int rc;
 
-	f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
-	if (IS_ERR(f->fr_mr))
+	frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
+	if (IS_ERR(frwr->fr_mr))
 		goto out_mr_err;
 
-	r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
-	if (!r->mw_sg)
+	mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL);
+	if (!mr->mr_sg)
 		goto out_list_err;
 
-	sg_init_table(r->mw_sg, depth);
-	init_completion(&f->fr_linv_done);
+	sg_init_table(mr->mr_sg, depth);
+	init_completion(&frwr->fr_linv_done);
 	return 0;
 
 out_mr_err:
-	rc = PTR_ERR(f->fr_mr);
+	rc = PTR_ERR(frwr->fr_mr);
 	dprintk("RPC:       %s: ib_alloc_mr status %i\n",
 		__func__, rc);
 	return rc;
@@ -124,83 +124,85 @@ out_list_err:
 	rc = -ENOMEM;
 	dprintk("RPC:       %s: sg allocation failure\n",
 		__func__);
-	ib_dereg_mr(f->fr_mr);
+	ib_dereg_mr(frwr->fr_mr);
 	return rc;
 }
 
 static void
-frwr_op_release_mr(struct rpcrdma_mw *r)
+frwr_op_release_mr(struct rpcrdma_mr *mr)
 {
 	int rc;
 
-	/* Ensure MW is not on any rl_registered list */
-	if (!list_empty(&r->mw_list))
-		list_del(&r->mw_list);
+	/* Ensure MR is not on any rl_registered list */
+	if (!list_empty(&mr->mr_list))
+		list_del(&mr->mr_list);
 
-	rc = ib_dereg_mr(r->frmr.fr_mr);
+	rc = ib_dereg_mr(mr->frwr.fr_mr);
 	if (rc)
 		pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
-		       r, rc);
-	kfree(r->mw_sg);
-	kfree(r);
+		       mr, rc);
+	kfree(mr->mr_sg);
+	kfree(mr);
 }
 
 static int
-__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
 {
-	struct rpcrdma_frmr *f = &r->frmr;
+	struct rpcrdma_frwr *frwr = &mr->frwr;
 	int rc;
 
-	rc = ib_dereg_mr(f->fr_mr);
+	rc = ib_dereg_mr(frwr->fr_mr);
 	if (rc) {
 		pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
-			rc, r);
+			rc, mr);
 		return rc;
 	}
 
-	f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
-			       ia->ri_max_frmr_depth);
-	if (IS_ERR(f->fr_mr)) {
+	frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
+				  ia->ri_max_frwr_depth);
+	if (IS_ERR(frwr->fr_mr)) {
 		pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
-			PTR_ERR(f->fr_mr), r);
-		return PTR_ERR(f->fr_mr);
+			PTR_ERR(frwr->fr_mr), mr);
+		return PTR_ERR(frwr->fr_mr);
 	}
 
-	dprintk("RPC:       %s: recovered FRMR %p\n", __func__, f);
-	f->fr_state = FRMR_IS_INVALID;
+	dprintk("RPC:       %s: recovered FRWR %p\n", __func__, frwr);
+	frwr->fr_state = FRWR_IS_INVALID;
 	return 0;
 }
 
-/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
+/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR.
  */
 static void
-frwr_op_recover_mr(struct rpcrdma_mw *mw)
+frwr_op_recover_mr(struct rpcrdma_mr *mr)
 {
-	enum rpcrdma_frmr_state state = mw->frmr.fr_state;
-	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+	enum rpcrdma_frwr_state state = mr->frwr.fr_state;
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	int rc;
 
-	rc = __frwr_reset_mr(ia, mw);
-	if (state != FRMR_FLUSHED_LI)
+	rc = __frwr_mr_reset(ia, mr);
+	if (state != FRWR_FLUSHED_LI) {
+		trace_xprtrdma_dma_unmap(mr);
 		ib_dma_unmap_sg(ia->ri_device,
-				mw->mw_sg, mw->mw_nents, mw->mw_dir);
+				mr->mr_sg, mr->mr_nents, mr->mr_dir);
+	}
 	if (rc)
 		goto out_release;
 
-	rpcrdma_put_mw(r_xprt, mw);
+	rpcrdma_mr_put(mr);
 	r_xprt->rx_stats.mrs_recovered++;
 	return;
 
 out_release:
-	pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
+	pr_err("rpcrdma: FRWR reset failed %d, %p release\n", rc, mr);
 	r_xprt->rx_stats.mrs_orphaned++;
 
-	spin_lock(&r_xprt->rx_buf.rb_mwlock);
-	list_del(&mw->mw_all);
-	spin_unlock(&r_xprt->rx_buf.rb_mwlock);
+	spin_lock(&r_xprt->rx_buf.rb_mrlock);
+	list_del(&mr->mr_all);
+	spin_unlock(&r_xprt->rx_buf.rb_mrlock);
 
-	frwr_op_release_mr(mw);
+	frwr_op_release_mr(mr);
 }
 
 static int
@@ -214,31 +216,31 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 	if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
 		ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
 
-	ia->ri_max_frmr_depth =
+	ia->ri_max_frwr_depth =
 			min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
 			      attrs->max_fast_reg_page_list_len);
 	dprintk("RPC:       %s: device's max FR page list len = %u\n",
-		__func__, ia->ri_max_frmr_depth);
-
-	/* Add room for frmr register and invalidate WRs.
-	 * 1. FRMR reg WR for head
-	 * 2. FRMR invalidate WR for head
-	 * 3. N FRMR reg WRs for pagelist
-	 * 4. N FRMR invalidate WRs for pagelist
-	 * 5. FRMR reg WR for tail
-	 * 6. FRMR invalidate WR for tail
+		__func__, ia->ri_max_frwr_depth);
+
+	/* Add room for frwr register and invalidate WRs.
+	 * 1. FRWR reg WR for head
+	 * 2. FRWR invalidate WR for head
+	 * 3. N FRWR reg WRs for pagelist
+	 * 4. N FRWR invalidate WRs for pagelist
+	 * 5. FRWR reg WR for tail
+	 * 6. FRWR invalidate WR for tail
 	 * 7. The RDMA_SEND WR
 	 */
 	depth = 7;
 
-	/* Calculate N if the device max FRMR depth is smaller than
+	/* Calculate N if the device max FRWR depth is smaller than
 	 * RPCRDMA_MAX_DATA_SEGS.
 	 */
-	if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
-		delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
+	if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
+		delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
 		do {
-			depth += 2; /* FRMR reg + invalidate */
-			delta -= ia->ri_max_frmr_depth;
+			depth += 2; /* FRWR reg + invalidate */
+			delta -= ia->ri_max_frwr_depth;
 		} while (delta > 0);
 	}
 
@@ -252,7 +254,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 	}
 
 	ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
-				ia->ri_max_frmr_depth);
+				ia->ri_max_frwr_depth);
 	return 0;
 }
 
@@ -265,7 +267,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
 	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-		     RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth);
+		     RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth);
 }
 
 static void
@@ -286,16 +288,16 @@ __frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr)
 static void
 frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct rpcrdma_frmr *frmr;
-	struct ib_cqe *cqe;
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr =
+			container_of(cqe, struct rpcrdma_frwr, fr_cqe);
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
 	if (wc->status != IB_WC_SUCCESS) {
-		cqe = wc->wr_cqe;
-		frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
-		frmr->fr_state = FRMR_FLUSHED_FR;
+		frwr->fr_state = FRWR_FLUSHED_FR;
 		__frwr_sendcompletion_flush(wc, "fastreg");
 	}
+	trace_xprtrdma_wc_fastreg(wc, frwr);
 }
 
 /**
@@ -307,16 +309,16 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
 static void
 frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct rpcrdma_frmr *frmr;
-	struct ib_cqe *cqe;
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
+						 fr_cqe);
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
 	if (wc->status != IB_WC_SUCCESS) {
-		cqe = wc->wr_cqe;
-		frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
-		frmr->fr_state = FRMR_FLUSHED_LI;
+		frwr->fr_state = FRWR_FLUSHED_LI;
 		__frwr_sendcompletion_flush(wc, "localinv");
 	}
+	trace_xprtrdma_wc_li(wc, frwr);
 }
 
 /**
@@ -329,17 +331,17 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
 static void
 frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct rpcrdma_frmr *frmr;
-	struct ib_cqe *cqe;
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
+						 fr_cqe);
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
-	cqe = wc->wr_cqe;
-	frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
 	if (wc->status != IB_WC_SUCCESS) {
-		frmr->fr_state = FRMR_FLUSHED_LI;
+		frwr->fr_state = FRWR_FLUSHED_LI;
 		__frwr_sendcompletion_flush(wc, "localinv");
 	}
-	complete(&frmr->fr_linv_done);
+	complete(&frwr->fr_linv_done);
+	trace_xprtrdma_wc_li_wake(wc, frwr);
 }
 
 /* Post a REG_MR Work Request to register a memory region
@@ -347,41 +349,39 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
  */
 static struct rpcrdma_mr_seg *
 frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-	    int nsegs, bool writing, struct rpcrdma_mw **out)
+	    int nsegs, bool writing, struct rpcrdma_mr **out)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
-	struct rpcrdma_mw *mw;
-	struct rpcrdma_frmr *frmr;
-	struct ib_mr *mr;
+	struct rpcrdma_frwr *frwr;
+	struct rpcrdma_mr *mr;
+	struct ib_mr *ibmr;
 	struct ib_reg_wr *reg_wr;
 	struct ib_send_wr *bad_wr;
 	int rc, i, n;
 	u8 key;
 
-	mw = NULL;
+	mr = NULL;
 	do {
-		if (mw)
-			rpcrdma_defer_mr_recovery(mw);
-		mw = rpcrdma_get_mw(r_xprt);
-		if (!mw)
+		if (mr)
+			rpcrdma_mr_defer_recovery(mr);
+		mr = rpcrdma_mr_get(r_xprt);
+		if (!mr)
 			return ERR_PTR(-ENOBUFS);
-	} while (mw->frmr.fr_state != FRMR_IS_INVALID);
-	frmr = &mw->frmr;
-	frmr->fr_state = FRMR_IS_VALID;
-	mr = frmr->fr_mr;
-	reg_wr = &frmr->fr_regwr;
-
-	if (nsegs > ia->ri_max_frmr_depth)
-		nsegs = ia->ri_max_frmr_depth;
+	} while (mr->frwr.fr_state != FRWR_IS_INVALID);
+	frwr = &mr->frwr;
+	frwr->fr_state = FRWR_IS_VALID;
+
+	if (nsegs > ia->ri_max_frwr_depth)
+		nsegs = ia->ri_max_frwr_depth;
 	for (i = 0; i < nsegs;) {
 		if (seg->mr_page)
-			sg_set_page(&mw->mw_sg[i],
+			sg_set_page(&mr->mr_sg[i],
 				    seg->mr_page,
 				    seg->mr_len,
 				    offset_in_page(seg->mr_offset));
 		else
-			sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
+			sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
 				   seg->mr_len);
 
 		++seg;
@@ -392,30 +392,29 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
 			break;
 	}
-	mw->mw_dir = rpcrdma_data_dir(writing);
+	mr->mr_dir = rpcrdma_data_dir(writing);
 
-	mw->mw_nents = ib_dma_map_sg(ia->ri_device, mw->mw_sg, i, mw->mw_dir);
-	if (!mw->mw_nents)
+	mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir);
+	if (!mr->mr_nents)
 		goto out_dmamap_err;
 
-	n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE);
-	if (unlikely(n != mw->mw_nents))
+	ibmr = frwr->fr_mr;
+	n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
+	if (unlikely(n != mr->mr_nents))
 		goto out_mapmr_err;
 
-	dprintk("RPC:       %s: Using frmr %p to map %u segments (%llu bytes)\n",
-		__func__, frmr, mw->mw_nents, mr->length);
-
-	key = (u8)(mr->rkey & 0x000000FF);
-	ib_update_fast_reg_key(mr, ++key);
+	key = (u8)(ibmr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(ibmr, ++key);
 
+	reg_wr = &frwr->fr_regwr;
 	reg_wr->wr.next = NULL;
 	reg_wr->wr.opcode = IB_WR_REG_MR;
-	frmr->fr_cqe.done = frwr_wc_fastreg;
-	reg_wr->wr.wr_cqe = &frmr->fr_cqe;
+	frwr->fr_cqe.done = frwr_wc_fastreg;
+	reg_wr->wr.wr_cqe = &frwr->fr_cqe;
 	reg_wr->wr.num_sge = 0;
 	reg_wr->wr.send_flags = 0;
-	reg_wr->mr = mr;
-	reg_wr->key = mr->rkey;
+	reg_wr->mr = ibmr;
+	reg_wr->key = ibmr->rkey;
 	reg_wr->access = writing ?
 			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
 			 IB_ACCESS_REMOTE_READ;
@@ -424,47 +423,64 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	if (rc)
 		goto out_senderr;
 
-	mw->mw_handle = mr->rkey;
-	mw->mw_length = mr->length;
-	mw->mw_offset = mr->iova;
+	mr->mr_handle = ibmr->rkey;
+	mr->mr_length = ibmr->length;
+	mr->mr_offset = ibmr->iova;
 
-	*out = mw;
+	*out = mr;
 	return seg;
 
 out_dmamap_err:
 	pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
-	       mw->mw_sg, i);
-	frmr->fr_state = FRMR_IS_INVALID;
-	rpcrdma_put_mw(r_xprt, mw);
+	       mr->mr_sg, i);
+	frwr->fr_state = FRWR_IS_INVALID;
+	rpcrdma_mr_put(mr);
 	return ERR_PTR(-EIO);
 
 out_mapmr_err:
 	pr_err("rpcrdma: failed to map mr %p (%d/%d)\n",
-	       frmr->fr_mr, n, mw->mw_nents);
-	rpcrdma_defer_mr_recovery(mw);
+	       frwr->fr_mr, n, mr->mr_nents);
+	rpcrdma_mr_defer_recovery(mr);
 	return ERR_PTR(-EIO);
 
 out_senderr:
-	pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
-	rpcrdma_defer_mr_recovery(mw);
+	pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc);
+	rpcrdma_mr_defer_recovery(mr);
 	return ERR_PTR(-ENOTCONN);
 }
 
+/* Handle a remotely invalidated mr on the @mrs list
+ */
+static void
+frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
+{
+	struct rpcrdma_mr *mr;
+
+	list_for_each_entry(mr, mrs, mr_list)
+		if (mr->mr_handle == rep->rr_inv_rkey) {
+			list_del(&mr->mr_list);
+			trace_xprtrdma_remoteinv(mr);
+			mr->frwr.fr_state = FRWR_IS_INVALID;
+			rpcrdma_mr_unmap_and_put(mr);
+			break;	/* only one invalidated MR per RPC */
+		}
+}
+
 /* Invalidate all memory regions that were registered for "req".
  *
  * Sleeps until it is safe for the host CPU to access the
  * previously mapped memory regions.
  *
- * Caller ensures that @mws is not empty before the call. This
+ * Caller ensures that @mrs is not empty before the call. This
  * function empties the list.
  */
 static void
-frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
+frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
 {
 	struct ib_send_wr *first, **prev, *last, *bad_wr;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct rpcrdma_frmr *f;
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_frwr *frwr;
+	struct rpcrdma_mr *mr;
 	int count, rc;
 
 	/* ORDER: Invalidate all of the MRs first
@@ -472,31 +488,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
 	 * Chain the LOCAL_INV Work Requests and post them with
 	 * a single ib_post_send() call.
 	 */
-	f = NULL;
+	frwr = NULL;
 	count = 0;
 	prev = &first;
-	list_for_each_entry(mw, mws, mw_list) {
-		mw->frmr.fr_state = FRMR_IS_INVALID;
+	list_for_each_entry(mr, mrs, mr_list) {
+		mr->frwr.fr_state = FRWR_IS_INVALID;
 
-		if (mw->mw_flags & RPCRDMA_MW_F_RI)
-			continue;
+		frwr = &mr->frwr;
+		trace_xprtrdma_localinv(mr);
 
-		f = &mw->frmr;
-		dprintk("RPC:       %s: invalidating frmr %p\n",
-			__func__, f);
-
-		f->fr_cqe.done = frwr_wc_localinv;
-		last = &f->fr_invwr;
+		frwr->fr_cqe.done = frwr_wc_localinv;
+		last = &frwr->fr_invwr;
 		memset(last, 0, sizeof(*last));
-		last->wr_cqe = &f->fr_cqe;
+		last->wr_cqe = &frwr->fr_cqe;
 		last->opcode = IB_WR_LOCAL_INV;
-		last->ex.invalidate_rkey = mw->mw_handle;
+		last->ex.invalidate_rkey = mr->mr_handle;
 		count++;
 
 		*prev = last;
 		prev = &last->next;
 	}
-	if (!f)
+	if (!frwr)
 		goto unmap;
 
 	/* Strong send queue ordering guarantees that when the
@@ -504,8 +516,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
 	 * are complete.
 	 */
 	last->send_flags = IB_SEND_SIGNALED;
-	f->fr_cqe.done = frwr_wc_localinv_wake;
-	reinit_completion(&f->fr_linv_done);
+	frwr->fr_cqe.done = frwr_wc_localinv_wake;
+	reinit_completion(&frwr->fr_linv_done);
 
 	/* Transport disconnect drains the receive CQ before it
 	 * replaces the QP. The RPC reply handler won't call us
@@ -515,36 +527,32 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws)
 	bad_wr = NULL;
 	rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
 	if (bad_wr != first)
-		wait_for_completion(&f->fr_linv_done);
+		wait_for_completion(&frwr->fr_linv_done);
 	if (rc)
 		goto reset_mrs;
 
 	/* ORDER: Now DMA unmap all of the MRs, and return
-	 * them to the free MW list.
+	 * them to the free MR list.
 	 */
 unmap:
-	while (!list_empty(mws)) {
-		mw = rpcrdma_pop_mw(mws);
-		dprintk("RPC:       %s: DMA unmapping frmr %p\n",
-			__func__, &mw->frmr);
-		ib_dma_unmap_sg(ia->ri_device,
-				mw->mw_sg, mw->mw_nents, mw->mw_dir);
-		rpcrdma_put_mw(r_xprt, mw);
+	while (!list_empty(mrs)) {
+		mr = rpcrdma_mr_pop(mrs);
+		rpcrdma_mr_unmap_and_put(mr);
 	}
 	return;
 
 reset_mrs:
-	pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
+	pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc);
 
 	/* Find and reset the MRs in the LOCAL_INV WRs that did not
 	 * get posted.
 	 */
 	while (bad_wr) {
-		f = container_of(bad_wr, struct rpcrdma_frmr,
-				 fr_invwr);
-		mw = container_of(f, struct rpcrdma_mw, frmr);
+		frwr = container_of(bad_wr, struct rpcrdma_frwr,
+				    fr_invwr);
+		mr = container_of(frwr, struct rpcrdma_mr, frwr);
 
-		__frwr_reset_mr(ia, mw);
+		__frwr_mr_reset(ia, mr);
 
 		bad_wr = bad_wr->next;
 	}
@@ -553,6 +561,7 @@ reset_mrs:
 
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
+	.ro_reminv			= frwr_op_reminv,
 	.ro_unmap_sync			= frwr_op_unmap_sync,
 	.ro_recover_mr			= frwr_op_recover_mr,
 	.ro_open			= frwr_op_open,

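The fast-register path in frwr_op_map() advances the low-order byte of the MR's rkey before every registration, so a peer that holds on to a stale rkey takes a protection error instead of silently touching remapped memory. A minimal sketch of that rotation, assuming only an already-allocated fast-reg MR and <rdma/ib_verbs.h>:

/* Sketch, not part of the patch: the key rotation performed by
 * frwr_op_map() above.  Only the rkey's low octet is replaced.
 */
static void frwr_rotate_key_sketch(struct ib_mr *ibmr)
{
	u8 key = (u8)(ibmr->rkey & 0x000000FF);	/* e.g. low octet 0x07 */

	ib_update_fast_reg_key(ibmr, ++key);	/* rkey now ends 0x08 */
}
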
+ 7 - 5
net/sunrpc/xprtrdma/module.c

@@ -1,18 +1,20 @@
 /*
- * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2015, 2017 Oracle.  All rights reserved.
  */
 
 /* rpcrdma.ko module initialization
  */
 
+#include <linux/types.h>
+#include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sunrpc/svc_rdma.h>
-#include "xprt_rdma.h"
 
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY	RPCDBG_TRANS
-#endif
+#include <asm/swab.h>
+
+#define CREATE_TRACE_POINTS
+#include "xprt_rdma.h"
 
 MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc.");
 MODULE_DESCRIPTION("RPC/RDMA Transport");

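module.c becomes the single compilation unit that instantiates the new xprtrdma tracepoints: it defines CREATE_TRACE_POINTS before including xprt_rdma.h, which in turn pulls in <trace/events/rpcrdma.h>. For readers unfamiliar with the convention, a sketch using a made-up event (the real definitions live in the trace header, which is not part of these hunks):

/* In the trace header -- hypothetical event for illustration only: */
TRACE_EVENT(xprtrdma_example,
	TP_PROTO(const struct rpcrdma_rep *rep),
	TP_ARGS(rep),
	TP_STRUCT__entry(
		__field(u32, xid)
	),
	TP_fast_assign(
		__entry->xid = be32_to_cpu(rep->rr_xid);
	),
	TP_printk("xid=0x%08x", __entry->xid)
);

Exactly one .c file defines CREATE_TRACE_POINTS before the include, which expands each TRACE_EVENT into its registration and body code; every other file that includes the header gets only the inline trace_xprtrdma_example() stub.
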
+ 59 - 103
net/sunrpc/xprtrdma/rpc_rdma.c

@@ -292,15 +292,15 @@ encode_item_not_present(struct xdr_stream *xdr)
 }
 
 static void
-xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr)
 {
-	*iptr++ = cpu_to_be32(mw->mw_handle);
-	*iptr++ = cpu_to_be32(mw->mw_length);
-	xdr_encode_hyper(iptr, mw->mw_offset);
+	*iptr++ = cpu_to_be32(mr->mr_handle);
+	*iptr++ = cpu_to_be32(mr->mr_length);
+	xdr_encode_hyper(iptr, mr->mr_offset);
 }
 
 static int
-encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw)
+encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
 {
 	__be32 *p;
 
@@ -308,12 +308,12 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw)
 	if (unlikely(!p))
 		return -EMSGSIZE;
 
-	xdr_encode_rdma_segment(p, mw);
+	xdr_encode_rdma_segment(p, mr);
 	return 0;
 }
 
 static int
-encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw,
+encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
 		    u32 position)
 {
 	__be32 *p;
@@ -324,7 +324,7 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw,
 
 	*p++ = xdr_one;			/* Item present */
 	*p++ = cpu_to_be32(position);
-	xdr_encode_rdma_segment(p, mw);
+	xdr_encode_rdma_segment(p, mr);
 	return 0;
 }
 
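encode_rdma_segment() and encode_read_segment() above pin down the wire format. Reconstructed from the encoders (XDR, so all fields are big-endian), per RFC 8166:

/* Plain rdma_segment, as written by xdr_encode_rdma_segment():
 *
 *	+-----------+-----------+------------------------+
 *	| mr_handle | mr_length |       mr_offset        |
 *	|  4 octets |  4 octets |        8 octets        |
 *	+-----------+-----------+------------------------+
 *
 * A read-list item prefixes two more XDR words:
 *
 *	1 (item present) | position | handle | length | offset
 *
 * where position is the byte offset into the RPC call message at
 * which the chunk's payload logically belongs.  A final 0 word
 * (item not present, see encode_item_not_present()) closes the list.
 */
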
@@ -348,7 +348,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 {
 	struct xdr_stream *xdr = &req->rl_stream;
 	struct rpcrdma_mr_seg *seg;
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 	unsigned int pos;
 	int nsegs;
 
@@ -363,21 +363,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 
 	do {
 		seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
-						   false, &mw);
+						   false, &mr);
 		if (IS_ERR(seg))
 			return PTR_ERR(seg);
-		rpcrdma_push_mw(mw, &req->rl_registered);
+		rpcrdma_mr_push(mr, &req->rl_registered);
 
-		if (encode_read_segment(xdr, mw, pos) < 0)
+		if (encode_read_segment(xdr, mr, pos) < 0)
 			return -EMSGSIZE;
 
-		dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
-			rqst->rq_task->tk_pid, __func__, pos,
-			mw->mw_length, (unsigned long long)mw->mw_offset,
-			mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
-
+		trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs);
 		r_xprt->rx_stats.read_chunk_count++;
-		nsegs -= mw->mw_nents;
+		nsegs -= mr->mr_nents;
 	} while (nsegs);
 
 	return 0;
@@ -404,7 +400,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 {
 	struct xdr_stream *xdr = &req->rl_stream;
 	struct rpcrdma_mr_seg *seg;
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 	int nsegs, nchunks;
 	__be32 *segcount;
 
@@ -425,23 +421,19 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	nchunks = 0;
 	do {
 		seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
-						   true, &mw);
+						   true, &mr);
 		if (IS_ERR(seg))
 			return PTR_ERR(seg);
-		rpcrdma_push_mw(mw, &req->rl_registered);
+		rpcrdma_mr_push(mr, &req->rl_registered);
 
-		if (encode_rdma_segment(xdr, mw) < 0)
+		if (encode_rdma_segment(xdr, mr) < 0)
 			return -EMSGSIZE;
 
-		dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
-			rqst->rq_task->tk_pid, __func__,
-			mw->mw_length, (unsigned long long)mw->mw_offset,
-			mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
-
+		trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs);
 		r_xprt->rx_stats.write_chunk_count++;
-		r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
 		nchunks++;
-		nsegs -= mw->mw_nents;
+		nsegs -= mr->mr_nents;
 	} while (nsegs);
 
 	/* Update count of segments in this Write chunk */
@@ -468,7 +460,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 {
 	struct xdr_stream *xdr = &req->rl_stream;
 	struct rpcrdma_mr_seg *seg;
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 	int nsegs, nchunks;
 	__be32 *segcount;
 
@@ -487,23 +479,19 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	nchunks = 0;
 	do {
 		seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
-						   true, &mw);
+						   true, &mr);
 		if (IS_ERR(seg))
 			return PTR_ERR(seg);
-		rpcrdma_push_mw(mw, &req->rl_registered);
+		rpcrdma_mr_push(mr, &req->rl_registered);
 
-		if (encode_rdma_segment(xdr, mw) < 0)
+		if (encode_rdma_segment(xdr, mr) < 0)
 			return -EMSGSIZE;
 
-		dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
-			rqst->rq_task->tk_pid, __func__,
-			mw->mw_length, (unsigned long long)mw->mw_offset,
-			mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
-
+		trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs);
 		r_xprt->rx_stats.reply_chunk_count++;
-		r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
 		nchunks++;
-		nsegs -= mw->mw_nents;
+		nsegs -= mr->mr_nents;
 	} while (nsegs);
 
 	/* Update count of segments in the Reply chunk */
@@ -524,9 +512,6 @@ rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc)
 	struct ib_sge *sge;
 	unsigned int count;
 
-	dprintk("RPC:       %s: unmapping %u sges for sc=%p\n",
-		__func__, sc->sc_unmap_count, sc);
-
 	/* The first two SGEs contain the transport header and
 	 * the inline buffer. These are always left mapped so
 	 * they can be cheaply re-used.
@@ -754,11 +739,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 	__be32 *p;
 	int ret;
 
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-	if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
-		return rpcrdma_bc_marshal_reply(rqst);
-#endif
-
 	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
 	xdr_init_encode(xdr, &req->rl_hdrbuf,
 			req->rl_rdmabuf->rg_base);
@@ -821,6 +801,17 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 		rtype = rpcrdma_areadch;
 	}
 
+	/* If this is a retransmit, discard previously registered
+	 * chunks. Very likely the connection has been replaced,
+	 * so these registrations are invalid and unusable.
+	 */
+	while (unlikely(!list_empty(&req->rl_registered))) {
+		struct rpcrdma_mr *mr;
+
+		mr = rpcrdma_mr_pop(&req->rl_registered);
+		rpcrdma_mr_defer_recovery(mr);
+	}
+
 	/* This implementation supports the following combinations
 	 * of chunk lists in one RPC-over-RDMA Call message:
 	 *
@@ -868,10 +859,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 	if (ret)
 		goto out_err;
 
-	dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n",
-		rqst->rq_task->tk_pid, __func__,
-		transfertypes[rtype], transfertypes[wtype],
-		xdr_stream_pos(xdr));
+	trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype);
 
 	ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr),
 					&rqst->rq_snd_buf, rtype);
@@ -926,8 +914,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 	curlen = rqst->rq_rcv_buf.head[0].iov_len;
 	if (curlen > copy_len)
 		curlen = copy_len;
-	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
-		__func__, srcp, copy_len, curlen);
+	trace_xprtrdma_fixup(rqst, copy_len, curlen);
 	srcp += curlen;
 	copy_len -= curlen;
 
@@ -947,9 +934,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 			if (curlen > pagelist_len)
 				curlen = pagelist_len;
 
-			dprintk("RPC:       %s: page %d"
-				" srcp 0x%p len %d curlen %d\n",
-				__func__, i, srcp, copy_len, curlen);
+			trace_xprtrdma_fixup_pg(rqst, i, srcp,
+						copy_len, curlen);
 			destp = kmap_atomic(ppages[i]);
 			memcpy(destp + page_base, srcp, curlen);
 			flush_dcache_page(ppages[i]);
@@ -984,24 +970,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 	return fixup_copy_count;
 }
 
-/* Caller must guarantee @rep remains stable during this call.
- */
-static void
-rpcrdma_mark_remote_invalidation(struct list_head *mws,
-				 struct rpcrdma_rep *rep)
-{
-	struct rpcrdma_mw *mw;
-
-	if (!(rep->rr_wc_flags & IB_WC_WITH_INVALIDATE))
-		return;
-
-	list_for_each_entry(mw, mws, mw_list)
-		if (mw->mw_handle == rep->rr_inv_rkey) {
-			mw->mw_flags = RPCRDMA_MW_F_RI;
-			break; /* only one invalidated MR per RPC */
-		}
-}
-
 /* By convention, backchannel calls arrive via rdma_msg type
  * messages, and never populate the chunk lists. This makes
  * the RPC/RDMA header small and fixed in size, so it is
@@ -1058,26 +1026,19 @@ out_short:
 
 static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
 {
+	u32 handle;
+	u64 offset;
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4 * sizeof(*p));
 	if (unlikely(!p))
 		return -EIO;
 
-	ifdebug(FACILITY) {
-		u64 offset;
-		u32 handle;
-
-		handle = be32_to_cpup(p++);
-		*length = be32_to_cpup(p++);
-		xdr_decode_hyper(p, &offset);
-		dprintk("RPC:       %s:   segment %u@0x%016llx:0x%08x\n",
-			__func__, *length, (unsigned long long)offset,
-			handle);
-	} else {
-		*length = be32_to_cpup(p + 1);
-	}
+	handle = be32_to_cpup(p++);
+	*length = be32_to_cpup(p++);
+	xdr_decode_hyper(p, &offset);
 
+	trace_xprtrdma_decode_seg(handle, *length, offset);
 	return 0;
 }
 
@@ -1098,8 +1059,6 @@ static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
 		*length += seglength;
 	}
 
-	dprintk("RPC:       %s: segcount=%u, %u bytes\n",
-		__func__, be32_to_cpup(p), *length);
 	return 0;
 }
 
@@ -1296,8 +1255,7 @@ out:
  * being marshaled.
  */
 out_badheader:
-	dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
-		rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc));
+	trace_xprtrdma_reply_hdr(rep);
 	r_xprt->rx_stats.bad_reply_count++;
 	status = -EIO;
 	goto out;
@@ -1339,9 +1297,12 @@ void rpcrdma_deferred_completion(struct work_struct *work)
 	struct rpcrdma_rep *rep =
 			container_of(work, struct rpcrdma_rep, rr_work);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
+	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
 
-	rpcrdma_mark_remote_invalidation(&req->rl_registered, rep);
-	rpcrdma_release_rqst(rep->rr_rxprt, req);
+	trace_xprtrdma_defer_cmp(rep);
+	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
+		r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered);
+	rpcrdma_release_rqst(r_xprt, req);
 	rpcrdma_complete_rqst(rep);
 }
 
@@ -1360,8 +1321,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	u32 credits;
 	__be32 *p;
 
-	dprintk("RPC:       %s: incoming rep %p\n", __func__, rep);
-
 	if (rep->rr_hdrbuf.head[0].iov_len == 0)
 		goto out_badstatus;
 
@@ -1405,8 +1364,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	rep->rr_rqst = rqst;
 	clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
 
-	dprintk("RPC:       %s: reply %p completes request %p (xid 0x%08x)\n",
-		__func__, rep, req, be32_to_cpu(rep->rr_xid));
+	trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
 
 	queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work);
 	return;
@@ -1420,8 +1378,7 @@ out_badstatus:
 	return;
 
 out_badversion:
-	dprintk("RPC:       %s: invalid version %d\n",
-		__func__, be32_to_cpu(rep->rr_vers));
+	trace_xprtrdma_reply_vers(rep);
 	goto repost;
 
 /* The RPC transaction has already been terminated, or the header
@@ -1429,12 +1386,11 @@ out_badversion:
  */
 out_norqst:
 	spin_unlock(&xprt->recv_lock);
-	dprintk("RPC:       %s: no match for incoming xid 0x%08x\n",
-		__func__, be32_to_cpu(rep->rr_xid));
+	trace_xprtrdma_reply_rqst(rep);
 	goto repost;
 
 out_shortreply:
-	dprintk("RPC:       %s: short/invalid reply\n", __func__);
+	trace_xprtrdma_reply_short(rep);
 
 /* If no pending RPC transaction was matched, post a replacement
  * receive buffer before returning.

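Taken together, these rpc_rdma.c hunks replace the old list-marking pass (rpcrdma_mark_remote_invalidation) with a direct per-memreg-mode callout. A condensed sketch of the resulting reply path, reconstructed from the hunks rather than quoted:

/* rpcrdma_wc_receive()           caches wc->wc_flags and
 *                                wc->ex.invalidate_rkey in the rep
 *   -> rpcrdma_reply_handler()   matches the rep to its rqst/req
 *     -> rpcrdma_deferred_completion()
 *          if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
 *              ri_ops->ro_reminv(rep, &req->rl_registered);
 *              (FRWR pops the one matching MR; no LOCAL_INV needed)
 *          any MRs still on rl_registered are then invalidated
 *          locally via ro_unmap_sync(), e.g. frwr_op_unmap_sync()
 *          shown earlier, before the RPC completes.
 */
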
+ 55 - 73
net/sunrpc/xprtrdma/transport.c

@@ -67,8 +67,7 @@
 static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
 unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
-static unsigned int xprt_rdma_inline_write_padding;
-unsigned int xprt_rdma_memreg_strategy		= RPCRDMA_FRMR;
+unsigned int xprt_rdma_memreg_strategy		= RPCRDMA_FRWR;
 int xprt_rdma_pad_optimize;
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -81,6 +80,7 @@ static unsigned int zero;
 static unsigned int max_padding = PAGE_SIZE;
 static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
 static unsigned int max_memreg = RPCRDMA_LAST - 1;
+static unsigned int dummy;
 
 static struct ctl_table_header *sunrpc_table_header;
 
@@ -114,7 +114,7 @@ static struct ctl_table xr_tunables_table[] = {
 	},
 	{
 		.procname	= "rdma_inline_write_padding",
-		.data		= &xprt_rdma_inline_write_padding,
+		.data		= &dummy,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -259,13 +259,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
 
 	xprt_clear_connected(xprt);
 
-	dprintk("RPC:       %s: %sconnect\n", __func__,
-			r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
 	rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
 	if (rc)
 		xprt_wake_pending_tasks(xprt, rc);
 
-	dprintk("RPC:       %s: exit\n", __func__);
 	xprt_clear_connecting(xprt);
 }
 
@@ -275,7 +272,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
 	struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
 						   rx_xprt);
 
-	pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt);
+	trace_xprtrdma_inject_dsc(r_xprt);
 	rdma_disconnect(r_xprt->rx_ia.ri_id);
 }
 
@@ -295,7 +292,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
-	dprintk("RPC:       %s: called\n", __func__);
+	trace_xprtrdma_destroy(r_xprt);
 
 	cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
 
@@ -306,11 +303,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 	rpcrdma_ia_close(&r_xprt->rx_ia);
 
 	xprt_rdma_free_addresses(xprt);
-
 	xprt_free(xprt);
 
-	dprintk("RPC:       %s: returning\n", __func__);
-
 	module_put(THIS_MODULE);
 }
 
@@ -361,9 +355,7 @@ xprt_setup_rdma(struct xprt_create *args)
 	/*
 	 * Set up RDMA-specific connect data.
 	 */
-
-	sap = (struct sockaddr *)&cdata.addr;
-	memcpy(sap, args->dstaddr, args->addrlen);
+	sap = args->dstaddr;
 
 	/* Ensure xprt->addr holds valid server TCP (not RDMA)
 	 * address, for any side protocols which peek at it */
@@ -373,6 +365,7 @@ xprt_setup_rdma(struct xprt_create *args)
 
 	if (rpc_get_port(sap))
 		xprt_set_bound(xprt);
+	xprt_rdma_format_addresses(xprt, sap);
 
 	cdata.max_requests = xprt->max_reqs;
 
@@ -387,8 +380,6 @@ xprt_setup_rdma(struct xprt_create *args)
 	if (cdata.inline_rsize > cdata.rsize)
 		cdata.inline_rsize = cdata.rsize;
 
-	cdata.padding = xprt_rdma_inline_write_padding;
-
 	/*
 	 * Create new transport instance, which includes initialized
 	 *  o ia
@@ -398,7 +389,7 @@ xprt_setup_rdma(struct xprt_create *args)
 
 	new_xprt = rpcx_to_rdmax(xprt);
 
-	rc = rpcrdma_ia_open(new_xprt, sap);
+	rc = rpcrdma_ia_open(new_xprt);
 	if (rc)
 		goto out1;
 
@@ -407,31 +398,19 @@ xprt_setup_rdma(struct xprt_create *args)
 	 */
 	new_xprt->rx_data = cdata;
 	new_ep = &new_xprt->rx_ep;
-	new_ep->rep_remote_addr = cdata.addr;
 
 	rc = rpcrdma_ep_create(&new_xprt->rx_ep,
 				&new_xprt->rx_ia, &new_xprt->rx_data);
 	if (rc)
 		goto out2;
 
-	/*
-	 * Allocate pre-registered send and receive buffers for headers and
-	 * any inline data. Also specify any padding which will be provided
-	 * from a preregistered zero buffer.
-	 */
 	rc = rpcrdma_buffer_create(new_xprt);
 	if (rc)
 		goto out3;
 
-	/*
-	 * Register a callback for connection events. This is necessary because
-	 * connection loss notification is async. We also catch connection loss
-	 * when reaping receives.
-	 */
 	INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
 			  xprt_rdma_connect_worker);
 
-	xprt_rdma_format_addresses(xprt, sap);
 	xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
 	if (xprt->max_payload == 0)
 		goto out4;
@@ -445,16 +424,19 @@ xprt_setup_rdma(struct xprt_create *args)
 	dprintk("RPC:       %s: %s:%s\n", __func__,
 		xprt->address_strings[RPC_DISPLAY_ADDR],
 		xprt->address_strings[RPC_DISPLAY_PORT]);
+	trace_xprtrdma_create(new_xprt);
 	return xprt;
 
 out4:
-	xprt_rdma_free_addresses(xprt);
-	rc = -EINVAL;
+	rpcrdma_buffer_destroy(&new_xprt->rx_buf);
+	rc = -ENODEV;
 out3:
 	rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
 out2:
 	rpcrdma_ia_close(&new_xprt->rx_ia);
 out1:
+	trace_xprtrdma_destroy(new_xprt);
+	xprt_rdma_free_addresses(xprt);
 	xprt_free(xprt);
 	return ERR_PTR(rc);
 }
@@ -488,16 +470,34 @@ xprt_rdma_close(struct rpc_xprt *xprt)
 	rpcrdma_ep_disconnect(ep, ia);
 }
 
+/**
+ * xprt_rdma_set_port - update server port with rpcbind result
+ * @xprt: controlling RPC transport
+ * @port: new port value
+ *
+ * Transport connect status is unchanged.
+ */
 static void
 xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
 {
-	struct sockaddr_in *sap;
+	struct sockaddr *sap = (struct sockaddr *)&xprt->addr;
+	char buf[8];
 
-	sap = (struct sockaddr_in *)&xprt->addr;
-	sap->sin_port = htons(port);
-	sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
-	sap->sin_port = htons(port);
-	dprintk("RPC:       %s: %u\n", __func__, port);
+	dprintk("RPC:       %s: setting port for xprt %p (%s:%s) to %u\n",
+		__func__, xprt,
+		xprt->address_strings[RPC_DISPLAY_ADDR],
+		xprt->address_strings[RPC_DISPLAY_PORT],
+		port);
+
+	rpc_set_port(sap, port);
+
+	kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
+	snprintf(buf, sizeof(buf), "%u", port);
+	xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
+
+	kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
+	snprintf(buf, sizeof(buf), "%4hx", port);
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
 }
 
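This rewrite is the IPv6 fix: the old code treated xprt->addr as a sockaddr_in unconditionally and never refreshed the presentation strings, while the new code uses rpc_set_port(), which dispatches on the address family, and rebuilds the displayed port. Roughly what rpc_set_port() does, paraphrased from include/linux/sunrpc/addr.h (consult the header for the authoritative version):

static inline void rpc_set_port(struct sockaddr *sap, unsigned short port)
{
	switch (sap->sa_family) {
	case AF_INET:
		((struct sockaddr_in *)sap)->sin_port = htons(port);
		break;
	case AF_INET6:
		((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
		break;
	}
}
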
 /**
@@ -516,8 +516,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
 static void
 xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
 {
-	dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt);
-
 	xprt_force_disconnect(xprt);
 }
 
@@ -640,7 +638,7 @@ xprt_rdma_allocate(struct rpc_task *task)
 
 	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
 	if (req == NULL)
-		return -ENOMEM;
+		goto out_get;
 
 	flags = RPCRDMA_DEF_GFP;
 	if (RPC_IS_SWAPPER(task))
@@ -653,19 +651,18 @@ xprt_rdma_allocate(struct rpc_task *task)
 	if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
 		goto out_fail;
 
-	dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
-		task->tk_pid, __func__, rqst->rq_callsize,
-		rqst->rq_rcvsize, req);
-
 	req->rl_cpu = smp_processor_id();
 	req->rl_connect_cookie = 0;	/* our reserved value */
 	rpcrdma_set_xprtdata(rqst, req);
 	rqst->rq_buffer = req->rl_sendbuf->rg_base;
 	rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
+	trace_xprtrdma_allocate(task, req);
 	return 0;
 
 out_fail:
 	rpcrdma_buffer_put(req);
+out_get:
+	trace_xprtrdma_allocate(task, NULL);
 	return -ENOMEM;
 }
 
@@ -682,13 +679,9 @@ xprt_rdma_free(struct rpc_task *task)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 
-	if (test_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags))
-		return;
-
-	dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
-
 	if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags))
 		rpcrdma_release_rqst(r_xprt, req);
+	trace_xprtrdma_rpc_done(task, req);
 	rpcrdma_buffer_put(req);
 }
 
@@ -698,22 +691,12 @@ xprt_rdma_free(struct rpc_task *task)
  *
  * Caller holds the transport's write lock.
  *
- * Return values:
- *        0:	The request has been sent
- * ENOTCONN:	Caller needs to invoke connect logic then call again
- *  ENOBUFS:	Call again later to send the request
- *      EIO:	A permanent error occurred. The request was not sent,
- *		and don't try it again
- *
- * send_request invokes the meat of RPC RDMA. It must do the following:
- *
- *  1.  Marshal the RPC request into an RPC RDMA request, which means
- *	putting a header in front of data, and creating IOVs for RDMA
- *	from those in the request.
- *  2.  In marshaling, detect opportunities for RDMA, and use them.
- *  3.  Post a recv message to set up asynch completion, then send
- *	the request (rpcrdma_ep_post).
- *  4.  No partial sends are possible in the RPC-RDMA protocol (as in UDP).
+ * Returns:
+ *	%0 if the RPC message has been sent
+ *	%-ENOTCONN if the caller should reconnect and call again
+ *	%-ENOBUFS if the caller should call again later
+ *	%-EIO if a permanent error occurred and the request was not
+ *		sent. Do not try to send this message again.
  */
 static int
 xprt_rdma_send_request(struct rpc_task *task)
@@ -724,14 +707,14 @@ xprt_rdma_send_request(struct rpc_task *task)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	int rc = 0;
 
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+	if (unlikely(!rqst->rq_buffer))
+		return xprt_rdma_bc_send_reply(rqst);
+#endif	/* CONFIG_SUNRPC_BACKCHANNEL */
+
 	if (!xprt_connected(xprt))
 		goto drop_connection;
 
-	/* On retransmit, remove any previously registered chunks */
-	if (unlikely(!list_empty(&req->rl_registered)))
-		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
-						    &req->rl_registered);
-
 	rc = rpcrdma_marshal_req(r_xprt, rqst);
 	if (rc < 0)
 		goto failed_marshal;
@@ -744,7 +727,7 @@ xprt_rdma_send_request(struct rpc_task *task)
 		goto drop_connection;
 	req->rl_connect_cookie = xprt->connect_cookie;
 
-	set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
+	__set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
 	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
 		goto drop_connection;
 
@@ -904,8 +887,7 @@ int xprt_rdma_init(void)
 		"\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
 		xprt_rdma_slot_table_entries,
 		xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
-	dprintk("\tPadding %d\n\tMemreg %d\n",
-		xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
+	dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy);
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	if (!sunrpc_table_header)

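The tightened Returns documentation on xprt_rdma_send_request() reads as a contract; a hypothetical caller would act on it like this (the real caller is the generic transmit path in net/sunrpc/xprt.c, which dispatches through xprt->ops->send_request):

	rc = xprt_rdma_send_request(task);
	switch (rc) {
	case 0:		/* Call message is on the wire; wait for reply */
		break;
	case -ENOTCONN:	/* run connect logic, then call again */
		break;
	case -ENOBUFS:	/* out of resources; retry the send later */
		break;
	case -EIO:	/* permanent error; do not retransmit */
		break;
	}
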
+ 149 - 131
net/sunrpc/xprtrdma/verbs.c

@@ -71,8 +71,8 @@
 /*
  * internal functions
  */
-static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt);
-static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf);
+static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
 static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
 
 struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
@@ -108,7 +108,10 @@ static void
 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 {
 	struct rpcrdma_ep *ep = context;
+	struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
+						   rx_ep);
 
+	trace_xprtrdma_qp_error(r_xprt, event);
 	pr_err("rpcrdma: %s on device %s ep %p\n",
 	       ib_event_msg(event->event), event->device->name, context);
 
@@ -133,6 +136,7 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
 		container_of(cqe, struct rpcrdma_sendctx, sc_cqe);
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_send(sc, wc);
 	if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)
 		pr_err("rpcrdma: Send: %s (%u/0x%x)\n",
 		       ib_wc_status_msg(wc->status),
@@ -155,13 +159,11 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 					       rr_cqe);
 
 	/* WARNING: Only wr_id and status are reliable at this point */
+	trace_xprtrdma_wc_receive(rep, wc);
 	if (wc->status != IB_WC_SUCCESS)
 		goto out_fail;
 
 	/* status == SUCCESS means all fields in wc are trustworthy */
-	dprintk("RPC:       %s: rep %p opcode 'recv', length %u: success\n",
-		__func__, rep, wc->byte_len);
-
 	rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
 	rep->rr_wc_flags = wc->wc_flags;
 	rep->rr_inv_rkey = wc->ex.invalidate_rkey;
@@ -192,7 +194,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
 	unsigned int rsize, wsize;
 
 	/* Default settings for RPC-over-RDMA Version One */
-	r_xprt->rx_ia.ri_reminv_expected = false;
 	r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
 	rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
 	wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
@@ -200,7 +201,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
 	if (pmsg &&
 	    pmsg->cp_magic == rpcrdma_cmp_magic &&
 	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
-		r_xprt->rx_ia.ri_reminv_expected = true;
 		r_xprt->rx_ia.ri_implicit_roundup = true;
 		rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
 		wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
@@ -221,11 +221,9 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 	struct rpcrdma_xprt *xprt = id->context;
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 	struct rpcrdma_ep *ep = &xprt->rx_ep;
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-	struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
-#endif
 	int connstate = 0;
 
+	trace_xprtrdma_conn_upcall(xprt, event);
 	switch (event->event) {
 	case RDMA_CM_EVENT_ADDR_RESOLVED:
 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
@@ -234,21 +232,17 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 		break;
 	case RDMA_CM_EVENT_ADDR_ERROR:
 		ia->ri_async_rc = -EHOSTUNREACH;
-		dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
-			__func__, ep);
 		complete(&ia->ri_done);
 		break;
 	case RDMA_CM_EVENT_ROUTE_ERROR:
 		ia->ri_async_rc = -ENETUNREACH;
-		dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
-			__func__, ep);
 		complete(&ia->ri_done);
 		break;
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-		pr_info("rpcrdma: removing device %s for %pIS:%u\n",
+		pr_info("rpcrdma: removing device %s for %s:%s\n",
 			ia->ri_device->name,
-			sap, rpc_get_port(sap));
+			rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt));
 #endif
 		set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
 		ep->rep_connected = -ENODEV;
@@ -271,8 +265,8 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 		connstate = -ENETDOWN;
 		goto connected;
 	case RDMA_CM_EVENT_REJECTED:
-		dprintk("rpcrdma: connection to %pIS:%u rejected: %s\n",
-			sap, rpc_get_port(sap),
+		dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
+			rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
 			rdma_reject_msg(id, event->status));
 		connstate = -ECONNREFUSED;
 		if (event->status == IB_CM_REJ_STALE_CONN)
@@ -287,8 +281,9 @@ connected:
 		wake_up_all(&ep->rep_connect_wait);
 		/*FALLTHROUGH*/
 	default:
-		dprintk("RPC:       %s: %pIS:%u on %s/%s (ep 0x%p): %s\n",
-			__func__, sap, rpc_get_port(sap),
+		dprintk("RPC:       %s: %s:%s on %s/%s (ep 0x%p): %s\n",
+			__func__,
+			rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
 			ia->ri_device->name, ia->ri_ops->ro_displayname,
 			ep, rdma_event_msg(event->event));
 		break;
@@ -298,13 +293,14 @@ connected:
 }
 
 static struct rdma_cm_id *
-rpcrdma_create_id(struct rpcrdma_xprt *xprt,
-			struct rpcrdma_ia *ia, struct sockaddr *addr)
+rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
 {
 	unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
 	struct rdma_cm_id *id;
 	int rc;
 
+	trace_xprtrdma_conn_start(xprt);
+
 	init_completion(&ia->ri_done);
 	init_completion(&ia->ri_remove_done);
 
@@ -318,7 +314,9 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 	}
 
 	ia->ri_async_rc = -ETIMEDOUT;
-	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
+	rc = rdma_resolve_addr(id, NULL,
+			       (struct sockaddr *)&xprt->rx_xprt.addr,
+			       RDMA_RESOLVE_TIMEOUT);
 	if (rc) {
 		dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
 			__func__, rc);
@@ -326,8 +324,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 	}
 	rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
 	if (rc < 0) {
-		dprintk("RPC:       %s: wait() exited: %i\n",
-			__func__, rc);
+		trace_xprtrdma_conn_tout(xprt);
 		goto out;
 	}
 
@@ -344,8 +341,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 	}
 	rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
 	if (rc < 0) {
-		dprintk("RPC:       %s: wait() exited: %i\n",
-			__func__, rc);
+		trace_xprtrdma_conn_tout(xprt);
 		goto out;
 	}
 	rc = ia->ri_async_rc;
@@ -365,19 +361,18 @@ out:
 
 /**
  * rpcrdma_ia_open - Open and initialize an Interface Adapter.
- * @xprt: controlling transport
- * @addr: IP address of remote peer
+ * @xprt: transport with IA to (re)initialize
  *
  * Returns 0 on success, negative errno if an appropriate
  * Interface Adapter could not be found and opened.
  */
 int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr)
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
 {
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 	int rc;
 
-	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
+	ia->ri_id = rpcrdma_create_id(xprt, ia);
 	if (IS_ERR(ia->ri_id)) {
 		rc = PTR_ERR(ia->ri_id);
 		goto out_err;
@@ -392,7 +387,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr)
 	}
 
 	switch (xprt_rdma_memreg_strategy) {
-	case RPCRDMA_FRMR:
+	case RPCRDMA_FRWR:
 		if (frwr_is_supported(ia)) {
 			ia->ri_ops = &rpcrdma_frwr_memreg_ops;
 			break;
@@ -462,10 +457,12 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
 		rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
 		rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
 	}
-	rpcrdma_destroy_mrs(buf);
+	rpcrdma_mrs_destroy(buf);
 
 	/* Allow waiters to continue */
 	complete(&ia->ri_remove_done);
+
+	trace_xprtrdma_remove(r_xprt);
 }
 
 /**
@@ -476,7 +473,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
 void
 rpcrdma_ia_close(struct rpcrdma_ia *ia)
 {
-	dprintk("RPC:       %s: entering\n", __func__);
 	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
 		if (ia->ri_id->qp)
 			rdma_destroy_qp(ia->ri_id);
@@ -630,9 +626,6 @@ out1:
 void
 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
-	dprintk("RPC:       %s: entering, connected is %d\n",
-		__func__, ep->rep_connected);
-
 	cancel_delayed_work_sync(&ep->rep_connect_worker);
 
 	if (ia->ri_id->qp) {
@@ -653,13 +646,12 @@ static int
 rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
 			 struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
-	struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
 	int rc, err;
 
-	pr_info("%s: r_xprt = %p\n", __func__, r_xprt);
+	trace_xprtrdma_reinsert(r_xprt);
 
 	rc = -EHOSTUNREACH;
-	if (rpcrdma_ia_open(r_xprt, sap))
+	if (rpcrdma_ia_open(r_xprt))
 		goto out1;
 
 	rc = -ENOMEM;
@@ -676,7 +668,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
 		goto out3;
 	}
 
-	rpcrdma_create_mrs(r_xprt);
+	rpcrdma_mrs_create(r_xprt);
 	return 0;
 
 out3:
@@ -691,16 +683,15 @@ static int
 rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
 		     struct rpcrdma_ia *ia)
 {
-	struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
 	struct rdma_cm_id *id, *old;
 	int err, rc;
 
-	dprintk("RPC:       %s: reconnecting...\n", __func__);
+	trace_xprtrdma_reconnect(r_xprt);
 
 	rpcrdma_ep_disconnect(ep, ia);
 
 	rc = -EHOSTUNREACH;
-	id = rpcrdma_create_id(r_xprt, ia, sap);
+	id = rpcrdma_create_id(r_xprt, ia);
 	if (IS_ERR(id))
 		goto out;
 
@@ -817,16 +808,14 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 	int rc;
 
 	rc = rdma_disconnect(ia->ri_id);
-	if (!rc) {
+	if (!rc)
 		/* returns without wait if not connected */
 		wait_event_interruptible(ep->rep_connect_wait,
 							ep->rep_connected != 1);
-		dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
-			(ep->rep_connected == 1) ? "still " : "dis");
-	} else {
-		dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
+	else
 		ep->rep_connected = rc;
-	}
+	trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt,
+					       rx_ep), rc);
 
 	ib_drain_qp(ia->ri_id->qp);
 }
@@ -998,15 +987,15 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
 {
 	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
 						  rb_recovery_worker.work);
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 
 	spin_lock(&buf->rb_recovery_lock);
 	while (!list_empty(&buf->rb_stale_mrs)) {
-		mw = rpcrdma_pop_mw(&buf->rb_stale_mrs);
+		mr = rpcrdma_mr_pop(&buf->rb_stale_mrs);
 		spin_unlock(&buf->rb_recovery_lock);
 
-		dprintk("RPC:       %s: recovering MR %p\n", __func__, mw);
-		mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);
+		trace_xprtrdma_recover_mr(mr);
+		mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr);
 
 		spin_lock(&buf->rb_recovery_lock);
 	}
@@ -1014,20 +1003,20 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
 }
 
 void
-rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
+rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr)
 {
-	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 
 	spin_lock(&buf->rb_recovery_lock);
-	rpcrdma_push_mw(mw, &buf->rb_stale_mrs);
+	rpcrdma_mr_push(mr, &buf->rb_stale_mrs);
 	spin_unlock(&buf->rb_recovery_lock);
 
 	schedule_delayed_work(&buf->rb_recovery_worker, 0);
 }
 
 static void
-rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
+rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
@@ -1036,32 +1025,32 @@ rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
 	LIST_HEAD(all);
 
 	for (count = 0; count < 32; count++) {
-		struct rpcrdma_mw *mw;
+		struct rpcrdma_mr *mr;
 		int rc;
 
-		mw = kzalloc(sizeof(*mw), GFP_KERNEL);
-		if (!mw)
+		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+		if (!mr)
 			break;
 
-		rc = ia->ri_ops->ro_init_mr(ia, mw);
+		rc = ia->ri_ops->ro_init_mr(ia, mr);
 		if (rc) {
-			kfree(mw);
+			kfree(mr);
 			break;
 		}
 
-		mw->mw_xprt = r_xprt;
+		mr->mr_xprt = r_xprt;
 
-		list_add(&mw->mw_list, &free);
-		list_add(&mw->mw_all, &all);
+		list_add(&mr->mr_list, &free);
+		list_add(&mr->mr_all, &all);
 	}
 
-	spin_lock(&buf->rb_mwlock);
-	list_splice(&free, &buf->rb_mws);
+	spin_lock(&buf->rb_mrlock);
+	list_splice(&free, &buf->rb_mrs);
 	list_splice(&all, &buf->rb_all);
 	r_xprt->rx_stats.mrs_allocated += count;
-	spin_unlock(&buf->rb_mwlock);
+	spin_unlock(&buf->rb_mrlock);
 
-	dprintk("RPC:       %s: created %u MRs\n", __func__, count);
+	trace_xprtrdma_createmrs(r_xprt, count);
 }
 
 static void
@@ -1072,7 +1061,7 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
 	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
 						   rx_buf);
 
-	rpcrdma_create_mrs(r_xprt);
+	rpcrdma_mrs_create(r_xprt);
 }
 
 struct rpcrdma_req *
@@ -1093,10 +1082,17 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
 	return req;
 }
 
-struct rpcrdma_rep *
+/**
+ * rpcrdma_create_rep - Allocate an rpcrdma_rep object
+ * @r_xprt: controlling transport
+ *
+ * Returns 0 on success or a negative errno on failure.
+ */
+int
 rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_rep *rep;
 	int rc;
 
@@ -1121,12 +1117,18 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
 	rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
 	rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
 	rep->rr_recv_wr.num_sge = 1;
-	return rep;
+
+	spin_lock(&buf->rb_lock);
+	list_add(&rep->rr_list, &buf->rb_recv_bufs);
+	spin_unlock(&buf->rb_lock);
+	return 0;
 
 out_free:
 	kfree(rep);
 out:
-	return ERR_PTR(rc);
+	dprintk("RPC:       %s: reply buffer %d alloc failed\n",
+		__func__, rc);
+	return rc;
 }
 
 int
@@ -1137,10 +1139,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 
 	buf->rb_max_requests = r_xprt->rx_data.max_requests;
 	buf->rb_bc_srv_max_requests = 0;
-	spin_lock_init(&buf->rb_mwlock);
+	spin_lock_init(&buf->rb_mrlock);
 	spin_lock_init(&buf->rb_lock);
 	spin_lock_init(&buf->rb_recovery_lock);
-	INIT_LIST_HEAD(&buf->rb_mws);
+	INIT_LIST_HEAD(&buf->rb_mrs);
 	INIT_LIST_HEAD(&buf->rb_all);
 	INIT_LIST_HEAD(&buf->rb_stale_mrs);
 	INIT_DELAYED_WORK(&buf->rb_refresh_worker,
@@ -1148,7 +1150,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	INIT_DELAYED_WORK(&buf->rb_recovery_worker,
 			  rpcrdma_mr_recovery_worker);
 
-	rpcrdma_create_mrs(r_xprt);
+	rpcrdma_mrs_create(r_xprt);
 
 	INIT_LIST_HEAD(&buf->rb_send_bufs);
 	INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -1167,17 +1169,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	}
 
 	INIT_LIST_HEAD(&buf->rb_recv_bufs);
-	for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) {
-		struct rpcrdma_rep *rep;
-
-		rep = rpcrdma_create_rep(r_xprt);
-		if (IS_ERR(rep)) {
-			dprintk("RPC:       %s: reply buffer %d alloc failed\n",
-				__func__, i);
-			rc = PTR_ERR(rep);
+	for (i = 0; i <= buf->rb_max_requests; i++) {
+		rc = rpcrdma_create_rep(r_xprt);
+		if (rc)
 			goto out;
-		}
-		list_add(&rep->rr_list, &buf->rb_recv_bufs);
 	}
 
 	rc = rpcrdma_sendctxs_create(r_xprt);
@@ -1229,26 +1224,26 @@ rpcrdma_destroy_req(struct rpcrdma_req *req)
 }
 
 static void
-rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
+rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
 {
 	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
 						   rx_buf);
 	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 	unsigned int count;
 
 	count = 0;
-	spin_lock(&buf->rb_mwlock);
+	spin_lock(&buf->rb_mrlock);
 	while (!list_empty(&buf->rb_all)) {
-		mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-		list_del(&mw->mw_all);
+		mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all);
+		list_del(&mr->mr_all);
 
-		spin_unlock(&buf->rb_mwlock);
-		ia->ri_ops->ro_release_mr(mw);
+		spin_unlock(&buf->rb_mrlock);
+		ia->ri_ops->ro_release_mr(mr);
 		count++;
-		spin_lock(&buf->rb_mwlock);
+		spin_lock(&buf->rb_mrlock);
 	}
-	spin_unlock(&buf->rb_mwlock);
+	spin_unlock(&buf->rb_mrlock);
 	r_xprt->rx_stats.mrs_allocated = 0;
 
 	dprintk("RPC:       %s: released %u MRs\n", __func__, count);
@@ -1285,27 +1280,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 	spin_unlock(&buf->rb_reqslock);
 	buf->rb_recv_count = 0;
 
-	rpcrdma_destroy_mrs(buf);
+	rpcrdma_mrs_destroy(buf);
 }
 
-struct rpcrdma_mw *
-rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
+/**
+ * rpcrdma_mr_get - Allocate an rpcrdma_mr object
+ * @r_xprt: controlling transport
+ *
+ * Returns an initialized rpcrdma_mr or NULL if no free
+ * rpcrdma_mr objects are available.
+ */
+struct rpcrdma_mr *
+rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct rpcrdma_mw *mw = NULL;
+	struct rpcrdma_mr *mr = NULL;
 
-	spin_lock(&buf->rb_mwlock);
-	if (!list_empty(&buf->rb_mws))
-		mw = rpcrdma_pop_mw(&buf->rb_mws);
-	spin_unlock(&buf->rb_mwlock);
+	spin_lock(&buf->rb_mrlock);
+	if (!list_empty(&buf->rb_mrs))
+		mr = rpcrdma_mr_pop(&buf->rb_mrs);
+	spin_unlock(&buf->rb_mrlock);
 
-	if (!mw)
-		goto out_nomws;
-	mw->mw_flags = 0;
-	return mw;
+	if (!mr)
+		goto out_nomrs;
+	return mr;
 
-out_nomws:
-	dprintk("RPC:       %s: no MWs available\n", __func__);
+out_nomrs:
+	trace_xprtrdma_nomrs(r_xprt);
 	if (r_xprt->rx_ep.rep_connected != -ENODEV)
 		schedule_delayed_work(&buf->rb_refresh_worker, 0);
 
@@ -1315,14 +1316,39 @@ out_nomws:
 	return NULL;
 }
 
+static void
+__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr)
+{
+	spin_lock(&buf->rb_mrlock);
+	rpcrdma_mr_push(mr, &buf->rb_mrs);
+	spin_unlock(&buf->rb_mrlock);
+}
+
+/**
+ * rpcrdma_mr_put - Release an rpcrdma_mr object
+ * @mr: object to release
+ *
+ */
 void
-rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
+rpcrdma_mr_put(struct rpcrdma_mr *mr)
 {
-	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	__rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr);
+}
+
+/**
+ * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it
+ * @mr: object to release
+ *
+ */
+void
+rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
+{
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
 
-	spin_lock(&buf->rb_mwlock);
-	rpcrdma_push_mw(mw, &buf->rb_mws);
-	spin_unlock(&buf->rb_mwlock);
+	trace_xprtrdma_dma_unmap(mr);
+	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+			mr->mr_sg, mr->mr_nents, mr->mr_dir);
+	__rpcrdma_mr_put(&r_xprt->rx_buf, mr);
 }
 
 static struct rpcrdma_rep *
@@ -1359,11 +1385,11 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 	req = rpcrdma_buffer_get_req_locked(buffers);
 	req->rl_reply = rpcrdma_buffer_get_rep(buffers);
 	spin_unlock(&buffers->rb_lock);
+
 	return req;
 
 out_reqbuf:
 	spin_unlock(&buffers->rb_lock);
-	pr_warn("RPC:       %s: out of request buffers\n", __func__);
 	return NULL;
 }
 
@@ -1519,9 +1545,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 		req->rl_reply = NULL;
 	}
 
-	dprintk("RPC:       %s: posting %d s/g entries\n",
-		__func__, send_wr->num_sge);
-
 	if (!ep->rep_send_count ||
 	    test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
 		send_wr->send_flags |= IB_SEND_SIGNALED;
@@ -1530,14 +1553,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 		send_wr->send_flags &= ~IB_SEND_SIGNALED;
 		--ep->rep_send_count;
 	}
+
 	rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
+	trace_xprtrdma_post_send(req, rc);
 	if (rc)
-		goto out_postsend_err;
+		return -ENOTCONN;
 	return 0;
-
-out_postsend_err:
-	pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
-	return -ENOTCONN;
 }
 
 int
@@ -1550,23 +1571,20 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
 	if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
 		goto out_map;
 	rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
+	trace_xprtrdma_post_recv(rep, rc);
 	if (rc)
-		goto out_postrecv;
+		return -ENOTCONN;
 	return 0;
 
 out_map:
 	pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
 	return -EIO;
-
-out_postrecv:
-	pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
-	return -ENOTCONN;
 }
 
 /**
  * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests
  * @r_xprt: transport associated with these backchannel resources
- * @min_reqs: minimum number of incoming requests expected
+ * @count: minimum number of incoming requests expected
  *
  * Returns zero if all requested buffers were posted, or a negative errno.
  */
@@ -1594,7 +1612,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
 
 out_reqbuf:
 	spin_unlock(&buffers->rb_lock);
-	pr_warn("%s: no extra receive buffers\n", __func__);
+	trace_xprtrdma_noreps(r_xprt);
 	return -ENOMEM;
 
 out_rc:

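Between rpcrdma_mr_get(), rpcrdma_mr_push()/rpcrdma_mr_pop(), and rpcrdma_mr_unmap_and_put(), the MR cache now has a symmetric API. A sketch of the lifecycle, condensed across the layers that actually share this work (error handling elided; the -EAGAIN is illustrative, not from the patch):

	struct rpcrdma_mr *mr;

	mr = rpcrdma_mr_get(r_xprt);	/* take a free MR off rb_mrs */
	if (!mr)
		return -EAGAIN;		/* rb_refresh_worker was kicked */

	/* ro_map() fills mr->mr_sg, DMA maps it, and registers it */
	rpcrdma_mr_push(mr, &req->rl_registered);

	/* ... reply handled, registration no longer needed ... */
	mr = rpcrdma_mr_pop(&req->rl_registered);
	rpcrdma_mr_unmap_and_put(mr);	/* ib_dma_unmap_sg() + back to rb_mrs */
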
+ 61 - 55
net/sunrpc/xprtrdma/xprt_rdma.h

@@ -73,11 +73,10 @@ struct rpcrdma_ia {
 	struct completion	ri_remove_done;
 	int			ri_async_rc;
 	unsigned int		ri_max_segs;
-	unsigned int		ri_max_frmr_depth;
+	unsigned int		ri_max_frwr_depth;
 	unsigned int		ri_max_inline_write;
 	unsigned int		ri_max_inline_read;
 	unsigned int		ri_max_send_sges;
-	bool			ri_reminv_expected;
 	bool			ri_implicit_roundup;
 	enum ib_mr_type		ri_mrtype;
 	unsigned long		ri_flags;
@@ -101,7 +100,6 @@ struct rpcrdma_ep {
 	wait_queue_head_t 	rep_connect_wait;
 	struct rpcrdma_connect_private	rep_cm_private;
 	struct rdma_conn_param	rep_remote_cma;
-	struct sockaddr_storage	rep_remote_addr;
 	struct delayed_work	rep_connect_worker;
 };
 
@@ -232,29 +230,29 @@ enum {
 };
 
 /*
- * struct rpcrdma_mw - external memory region metadata
+ * struct rpcrdma_mr - external memory region metadata
  *
  * An external memory region is any buffer or page that is registered
  * on the fly (ie, not pre-registered).
  *
- * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
+ * Each rpcrdma_buffer has a list of free MRs anchored in rb_mrs. During
  * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
  * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
  * track of registration metadata while each RPC is pending.
  * rpcrdma_deregister_external() uses this metadata to unmap and
  * release these resources when an RPC is complete.
  */
-enum rpcrdma_frmr_state {
-	FRMR_IS_INVALID,	/* ready to be used */
-	FRMR_IS_VALID,		/* in use */
-	FRMR_FLUSHED_FR,	/* flushed FASTREG WR */
-	FRMR_FLUSHED_LI,	/* flushed LOCALINV WR */
+enum rpcrdma_frwr_state {
+	FRWR_IS_INVALID,	/* ready to be used */
+	FRWR_IS_VALID,		/* in use */
+	FRWR_FLUSHED_FR,	/* flushed FASTREG WR */
+	FRWR_FLUSHED_LI,	/* flushed LOCALINV WR */
 };
 
-struct rpcrdma_frmr {
+struct rpcrdma_frwr {
 	struct ib_mr			*fr_mr;
 	struct ib_cqe			fr_cqe;
-	enum rpcrdma_frmr_state		fr_state;
+	enum rpcrdma_frwr_state		fr_state;
 	struct completion		fr_linv_done;
 	union {
 		struct ib_reg_wr	fr_regwr;
@@ -267,26 +265,20 @@ struct rpcrdma_fmr {
 	u64			*fm_physaddrs;
 };
 
-struct rpcrdma_mw {
-	struct list_head	mw_list;
-	struct scatterlist	*mw_sg;
-	int			mw_nents;
-	enum dma_data_direction	mw_dir;
-	unsigned long		mw_flags;
+struct rpcrdma_mr {
+	struct list_head	mr_list;
+	struct scatterlist	*mr_sg;
+	int			mr_nents;
+	enum dma_data_direction	mr_dir;
 	union {
 		struct rpcrdma_fmr	fmr;
-		struct rpcrdma_frmr	frmr;
+		struct rpcrdma_frwr	frwr;
 	};
-	struct rpcrdma_xprt	*mw_xprt;
-	u32			mw_handle;
-	u32			mw_length;
-	u64			mw_offset;
-	struct list_head	mw_all;
-};
-
-/* mw_flags */
-enum {
-	RPCRDMA_MW_F_RI		= 1,
+	struct rpcrdma_xprt	*mr_xprt;
+	u32			mr_handle;
+	u32			mr_length;
+	u64			mr_offset;
+	struct list_head	mr_all;
 };
 
 /*
@@ -362,8 +354,7 @@ struct rpcrdma_req {
 
 /* rl_flags */
 enum {
-	RPCRDMA_REQ_F_BACKCHANNEL = 0,
-	RPCRDMA_REQ_F_PENDING,
+	RPCRDMA_REQ_F_PENDING = 0,
 	RPCRDMA_REQ_F_TX_RESOURCES,
 };
 
@@ -374,25 +365,25 @@ rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req)
 }
 
 static inline struct rpcrdma_req *
-rpcr_to_rdmar(struct rpc_rqst *rqst)
+rpcr_to_rdmar(const struct rpc_rqst *rqst)
 {
 	return rqst->rq_xprtdata;
 }
 
 static inline void
-rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list)
+rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list)
 {
-	list_add_tail(&mw->mw_list, list);
+	list_add_tail(&mr->mr_list, list);
 }
 
-static inline struct rpcrdma_mw *
-rpcrdma_pop_mw(struct list_head *list)
+static inline struct rpcrdma_mr *
+rpcrdma_mr_pop(struct list_head *list)
 {
-	struct rpcrdma_mw *mw;
+	struct rpcrdma_mr *mr;
 
-	mw = list_first_entry(list, struct rpcrdma_mw, mw_list);
-	list_del(&mw->mw_list);
-	return mw;
+	mr = list_first_entry(list, struct rpcrdma_mr, mr_list);
+	list_del(&mr->mr_list);
+	return mr;
 }
 
 /*
@@ -402,8 +393,8 @@ rpcrdma_pop_mw(struct list_head *list)
  * One of these is associated with a transport instance
  */
 struct rpcrdma_buffer {
-	spinlock_t		rb_mwlock;	/* protect rb_mws list */
-	struct list_head	rb_mws;
+	spinlock_t		rb_mrlock;	/* protect rb_mrs list */
+	struct list_head	rb_mrs;
 	struct list_head	rb_all;
 
 	unsigned long		rb_sc_head;
@@ -438,13 +429,11 @@ struct rpcrdma_buffer {
  * This data should be set with mount options
  */
 struct rpcrdma_create_data_internal {
-	struct sockaddr_storage	addr;	/* RDMA server address */
 	unsigned int	max_requests;	/* max requests (slots) in flight */
 	unsigned int	rsize;		/* mount rsize - max read hdr+data */
 	unsigned int	wsize;		/* mount wsize - max write hdr+data */
 	unsigned int	inline_rsize;	/* max non-rdma read data payload */
 	unsigned int	inline_wsize;	/* max non-rdma write data payload */
-	unsigned int	padding;	/* non-rdma write header padding */
 };
 
 /*
@@ -484,17 +473,19 @@ struct rpcrdma_memreg_ops {
 	struct rpcrdma_mr_seg *
 			(*ro_map)(struct rpcrdma_xprt *,
 				  struct rpcrdma_mr_seg *, int, bool,
-				  struct rpcrdma_mw **);
+				  struct rpcrdma_mr **);
+	void		(*ro_reminv)(struct rpcrdma_rep *rep,
+				     struct list_head *mrs);
 	void		(*ro_unmap_sync)(struct rpcrdma_xprt *,
 					 struct list_head *);
-	void		(*ro_recover_mr)(struct rpcrdma_mw *);
+	void		(*ro_recover_mr)(struct rpcrdma_mr *mr);
 	int		(*ro_open)(struct rpcrdma_ia *,
 				   struct rpcrdma_ep *,
 				   struct rpcrdma_create_data_internal *);
 	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
 	int		(*ro_init_mr)(struct rpcrdma_ia *,
-				      struct rpcrdma_mw *);
-	void		(*ro_release_mr)(struct rpcrdma_mw *);
+				      struct rpcrdma_mr *);
+	void		(*ro_release_mr)(struct rpcrdma_mr *mr);
 	const char	*ro_displayname;
 	const int	ro_send_w_inv_ok;
 };
@@ -525,6 +516,18 @@ struct rpcrdma_xprt {
 #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
 #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
 
+static inline const char *
+rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt)
+{
+	return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR];
+}
+
+static inline const char *
+rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt)
+{
+	return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_PORT];
+}
+
 /* Setting this to 0 ensures interoperability with early servers.
  * Setting this to 1 enhances certain unaligned read/write performance.
  * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
@@ -538,7 +541,7 @@ extern unsigned int xprt_rdma_memreg_strategy;
 /*
  * Interface Adapter calls - xprtrdma/verbs.c
  */
-int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr);
+int rpcrdma_ia_open(struct rpcrdma_xprt *xprt);
 void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
 void rpcrdma_ia_close(struct rpcrdma_ia *);
 bool frwr_is_supported(struct rpcrdma_ia *);
@@ -564,22 +567,23 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *);
  * Buffer calls - xprtrdma/verbs.c
  */
 struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
-struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
 void rpcrdma_destroy_req(struct rpcrdma_req *);
+int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt);
 int rpcrdma_buffer_create(struct rpcrdma_xprt *);
 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
 struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf);
 void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
 
-struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
-void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
+struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_mr_put(struct rpcrdma_mr *mr);
+void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr);
+void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr);
+
 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
 void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 
-void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
-
 struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
 					    gfp_t);
 bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *);
@@ -663,7 +667,7 @@ int xprt_rdma_bc_up(struct svc_serv *, struct net *);
 size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
 int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
 void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
-int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
+int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst);
 void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
 void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
 #endif	/* CONFIG_SUNRPC_BACKCHANNEL */
@@ -671,3 +675,5 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
 extern struct xprt_class xprt_rdma_bc;
 
 #endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */
+
+#include <trace/events/rpcrdma.h>
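
Note where the new include lands: after the header guard's #endif. The events in <trace/events/rpcrdma.h> take struct rpcrdma_rep *, struct rpcrdma_mr * and friends as arguments, so the trace header can only be parsed once those structures are declared; and trace headers are built to be re-included through the trace infrastructure, which appears to be why the include sits outside the guard. A sketch of the resulting include discipline (assumed layout), pairing with the CREATE_TRACE_POINTS hunk in module.c above:

/* xprt_rdma.h	declares rpcrdma_mr, rpcrdma_rep, rpcrdma_xprt, ...
 *	#include <trace/events/rpcrdma.h>	after the #endif
 * module.c	#define CREATE_TRACE_POINTS
 *	#include "xprt_rdma.h"			emits event bodies, once
 * other .c	#include "xprt_rdma.h"		inline trace stubs only
 */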