Explorar o código

Merge tag 'nfsd-4.11' of git://linux-nfs.org/~bfields/linux

Pull nfsd updates from Bruce Fields:
 "The nfsd update this round is mainly a lot of miscellaneous cleanups
  and bugfixes.

  A couple changes could theoretically break working setups on upgrade.
  I don't expect complaints in practice, but they seem worth calling out
  just in case:

   - NFS security labels are now off by default; a new security_label
     export flag re-enables them per export. But having them on by default
     is a disaster, as it generally only makes sense if all your clients
     and servers have similar enough selinux policies. Thanks to Jason
     Tibbitts for pointing this out.

   - NFSv4/UDP support is off. It was never really supported, and the
     spec explicitly forbids it. We only ever left it on out of
     laziness; thanks to Jeff Layton for finally fixing that"

* tag 'nfsd-4.11' of git://linux-nfs.org/~bfields/linux: (34 commits)
  nfsd: Fix display of the version string
  nfsd: fix configuration of supported minor versions
  sunrpc: don't register UDP port with rpcbind when version needs congestion control
  nfs/nfsd/sunrpc: enforce transport requirements for NFSv4
  sunrpc: flag transports as having congestion control
  sunrpc: turn bitfield flags in svc_version into bools
  nfsd: remove superfluous KERN_INFO
  nfsd: special case truncates some more
  nfsd: minor nfsd_setattr cleanup
  NFSD: Reserve adequate space for LOCKT operation
  NFSD: Get response size before operation for all RPCs
  nfsd/callback: Drop a useless data copy when comparing sessionid
  nfsd/callback: skip the callback tag
  nfsd/callback: Cleanup callback cred on shutdown
  nfsd/idmap: return nfserr_inval for 0-length names
  SUNRPC/Cache: Always treat the invalid cache as unexpired
  SUNRPC: Drop all entries from cache_detail when cache_purge()
  svcrdma: Poll CQs in "workqueue" mode
  svcrdma: Combine list fields in struct svc_rdma_op_ctxt
  svcrdma: Remove unused sc_dto_q field
  ...
Linus Torvalds hai 8 anos
pai
achega
8313064c2e

+ 2 - 0
fs/lockd/svc.c

@@ -322,6 +322,8 @@ static int lockd_inet6addr_event(struct notifier_block *this,
 		dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
 		sin6.sin6_family = AF_INET6;
 		sin6.sin6_addr = ifa->addr;
+		if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+			sin6.sin6_scope_id = ifa->idev->dev->ifindex;
 		svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
 			(struct sockaddr *)&sin6);
 	}

+ 4 - 2
fs/nfs/callback_xdr.c

@@ -1083,7 +1083,8 @@ struct svc_version nfs4_callback_version1 = {
 	.vs_proc = nfs4_callback_procedures1,
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
-	.vs_hidden = 1,
+	.vs_hidden = true,
+	.vs_need_cong_ctrl = true,
 };
 
 struct svc_version nfs4_callback_version4 = {
@@ -1092,5 +1093,6 @@ struct svc_version nfs4_callback_version4 = {
 	.vs_proc = nfs4_callback_procedures1,
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
-	.vs_hidden = 1,
+	.vs_hidden = true,
+	.vs_need_cong_ctrl = true,
 };

+ 1 - 0
fs/nfsd/export.c

@@ -1102,6 +1102,7 @@ static struct flags {
 	{ NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
 	{ NFSEXP_V4ROOT, {"v4root", ""}},
 	{ NFSEXP_PNFS, {"pnfs", ""}},
+	{ NFSEXP_SECURITY_LABEL, {"security_label", ""}},
 	{ 0, {"", ""}}
 };
 

+ 0 - 1
fs/nfsd/nfs2acl.c

@@ -376,5 +376,4 @@ struct svc_version	nfsd_acl_version2 = {
 		.vs_proc	= nfsd_acl_procedures2,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
-		.vs_hidden	= 0,
 };

+ 0 - 1
fs/nfsd/nfs3acl.c

@@ -266,6 +266,5 @@ struct svc_version	nfsd_acl_version3 = {
 		.vs_proc	= nfsd_acl_procedures3,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
-		.vs_hidden	= 0,
 };
 

+ 3 - 5
fs/nfsd/nfs3proc.c

@@ -193,11 +193,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 
 	fh_copy(&resp->fh, &argp->fh);
 	resp->committed = argp->stable;
-	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
-				   argp->offset,
-				   rqstp->rq_vec, argp->vlen,
-				   &cnt,
-				   &resp->committed);
+	nfserr = nfsd_write(rqstp, &resp->fh, argp->offset,
+				rqstp->rq_vec, argp->vlen,
+				&cnt, resp->committed);
 	resp->count = cnt;
 	RETURN_STATUS(nfserr);
 }

+ 12 - 7
fs/nfsd/nfs4callback.c

@@ -303,6 +303,7 @@ static int decode_cb_compound4res(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, length + 4);
 	if (unlikely(p == NULL))
 		goto out_overflow;
+	p += XDR_QUADLEN(length);
 	hdr->nops = be32_to_cpup(p);
 	return 0;
 out_overflow:
@@ -396,13 +397,10 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
 				    struct nfsd4_callback *cb)
 {
 	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
-	struct nfs4_sessionid id;
-	int status;
+	int status = -ESERVERFAULT;
 	__be32 *p;
 	u32 dummy;
 
-	status = -ESERVERFAULT;
-
 	/*
 	 * If the server returns different values for sessionID, slotID or
 	 * sequence number, the server is looney tunes.
@@ -410,9 +408,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
 	if (unlikely(p == NULL))
 		goto out_overflow;
-	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
-	if (memcmp(id.data, session->se_sessionid.data,
-					NFS4_MAX_SESSIONID_LEN) != 0) {
+
+	if (memcmp(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
 		dprintk("NFS: %s Invalid session id\n", __func__);
 		goto out;
 	}
@@ -753,6 +750,14 @@ int set_callback_cred(void)
 	return 0;
 }
 
+void cleanup_callback_cred(void)
+{
+	if (callback_cred) {
+		put_rpccred(callback_cred);
+		callback_cred = NULL;
+	}
+}
+
 static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
 {
 	if (clp->cl_minorversion == 0) {

+ 8 - 0
fs/nfsd/nfs4idmap.c

@@ -628,6 +628,10 @@ nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 {
 	__be32 status;
 	u32 id = -1;
+
+	if (name == NULL || namelen == 0)
+		return nfserr_inval;
+
 	status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
 	*uid = make_kuid(&init_user_ns, id);
 	if (!uid_valid(*uid))
@@ -641,6 +645,10 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 {
 	__be32 status;
 	u32 id = -1;
+
+	if (name == NULL || namelen == 0)
+		return nfserr_inval;
+
 	status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
 	*gid = make_kgid(&init_user_ns, id);
 	if (!gid_valid(*gid))

+ 75 - 13
fs/nfsd/nfs4proc.c

@@ -95,11 +95,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		   u32 *bmval, u32 *writable)
 {
 	struct dentry *dentry = cstate->current_fh.fh_dentry;
+	struct svc_export *exp = cstate->current_fh.fh_export;
 
 	if (!nfsd_attrs_supported(cstate->minorversion, bmval))
 		return nfserr_attrnotsupp;
 	if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry)))
 		return nfserr_attrnotsupp;
+	if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) &&
+			!(exp->ex_flags & NFSEXP_SECURITY_LABEL))
+		return nfserr_attrnotsupp;
 	if (writable && !bmval_is_subset(bmval, writable))
 		return nfserr_inval;
 	if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) &&
@@ -983,7 +987,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
 				write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
-				&write->wr_how_written);
+				write->wr_how_written);
 	fput(filp);
 
 	write->wr_bytes_written = cnt;
@@ -1838,6 +1842,12 @@ static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd
 	return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
 }
 
+static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	/* ac_supported, ac_resp_access */
+	return (op_encode_hdr_size + 2)* sizeof(__be32);
+}
+
 static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
@@ -1892,6 +1902,11 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
 	return ret;
 }
 
+static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE;
+}
+
 static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_change_info_maxsz)
@@ -1933,6 +1948,11 @@ static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o
 		XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE;
+}
+
 static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_change_info_maxsz)
@@ -1952,11 +1972,23 @@ static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
 		+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids)
+		* sizeof(__be32);
+}
+
 static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR *
+		(4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32);
+}
+
 static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
@@ -2011,6 +2043,19 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 }
 
 #ifdef CONFIG_NFSD_PNFS
+static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount = 0, rlen = 0;
+
+	maxcount = svc_max_payload(rqstp);
+	rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount);
+
+	return (op_encode_hdr_size +
+		1 /* gd_layout_type*/ +
+		XDR_QUADLEN(rlen) +
+		2 /* gd_notify_types */) * sizeof(__be32);
+}
+
 /*
  * At this stage we don't really know what layout driver will handle the request,
  * so we need to define an arbitrary upper bound here.
@@ -2040,10 +2085,17 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_
 }
 #endif /* CONFIG_NFSD_PNFS */
 
+
+static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 3) * sizeof(__be32);
+}
+
 static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
 		.op_name = "OP_ACCESS",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_access_rsize,
 	},
 	[OP_CLOSE] = {
 		.op_func = (nfsd4op_func)nfsd4_close,
@@ -2081,6 +2133,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_GETFH] = {
 		.op_func = (nfsd4op_func)nfsd4_getfh,
 		.op_name = "OP_GETFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_getfh_rsize,
 	},
 	[OP_LINK] = {
 		.op_func = (nfsd4op_func)nfsd4_link,
@@ -2099,6 +2152,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_LOCKT] = {
 		.op_func = (nfsd4op_func)nfsd4_lockt,
 		.op_name = "OP_LOCKT",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
 	},
 	[OP_LOCKU] = {
 		.op_func = (nfsd4op_func)nfsd4_locku,
@@ -2111,15 +2165,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_func = (nfsd4op_func)nfsd4_lookup,
 		.op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
 		.op_name = "OP_LOOKUP",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_LOOKUPP] = {
 		.op_func = (nfsd4op_func)nfsd4_lookupp,
 		.op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
 		.op_name = "OP_LOOKUPP",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_NVERIFY] = {
 		.op_func = (nfsd4op_func)nfsd4_nverify,
 		.op_name = "OP_NVERIFY",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_OPEN] = {
 		.op_func = (nfsd4op_func)nfsd4_open,
@@ -2177,6 +2234,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_READLINK] = {
 		.op_func = (nfsd4op_func)nfsd4_readlink,
 		.op_name = "OP_READLINK",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_readlink_rsize,
 	},
 	[OP_REMOVE] = {
 		.op_func = (nfsd4op_func)nfsd4_remove,
@@ -2215,6 +2273,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_func = (nfsd4op_func)nfsd4_secinfo,
 		.op_flags = OP_HANDLES_WRONGSEC,
 		.op_name = "OP_SECINFO",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize,
 	},
 	[OP_SETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_setattr,
@@ -2240,6 +2299,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_VERIFY] = {
 		.op_func = (nfsd4op_func)nfsd4_verify,
 		.op_name = "OP_VERIFY",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_WRITE] = {
 		.op_func = (nfsd4op_func)nfsd4_write,
@@ -2314,11 +2374,13 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
 		.op_flags = OP_HANDLES_WRONGSEC,
 		.op_name = "OP_SECINFO_NO_NAME",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize,
 	},
 	[OP_TEST_STATEID] = {
 		.op_func = (nfsd4op_func)nfsd4_test_stateid,
 		.op_flags = ALLOWED_WITHOUT_FH,
 		.op_name = "OP_TEST_STATEID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_test_stateid_rsize,
 	},
 	[OP_FREE_STATEID] = {
 		.op_func = (nfsd4op_func)nfsd4_free_stateid,
@@ -2332,6 +2394,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
 		.op_flags = ALLOWED_WITHOUT_FH,
 		.op_name = "OP_GETDEVICEINFO",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_getdeviceinfo_rsize,
 	},
 	[OP_LAYOUTGET] = {
 		.op_func = (nfsd4op_func)nfsd4_layoutget,
@@ -2381,6 +2444,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_SEEK] = {
 		.op_func = (nfsd4op_func)nfsd4_seek,
 		.op_name = "OP_SEEK",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_seek_rsize,
 	},
 };
 
@@ -2425,14 +2489,11 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
 
 int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
-	struct nfsd4_operation *opdesc;
-	nfsd4op_rsize estimator;
-
 	if (op->opnum == OP_ILLEGAL)
 		return op_encode_hdr_size * sizeof(__be32);
-	opdesc = OPDESC(op);
-	estimator = opdesc->op_rsize_bop;
-	return estimator ? estimator(rqstp, op) : PAGE_SIZE;
+
+	BUG_ON(OPDESC(op)->op_rsize_bop == NULL);
+	return OPDESC(op)->op_rsize_bop(rqstp, op);
 }
 
 void warn_on_nonidempotent_op(struct nfsd4_op *op)
@@ -2476,12 +2537,13 @@ static struct svc_procedure		nfsd_procedures4[2] = {
 };
 
 struct svc_version	nfsd_version4 = {
-		.vs_vers	= 4,
-		.vs_nproc	= 2,
-		.vs_proc	= nfsd_procedures4,
-		.vs_dispatch	= nfsd_dispatch,
-		.vs_xdrsize	= NFS4_SVC_XDRSIZE,
-		.vs_rpcb_optnl	= 1,
+	.vs_vers		= 4,
+	.vs_nproc		= 2,
+	.vs_proc		= nfsd_procedures4,
+	.vs_dispatch		= nfsd_dispatch,
+	.vs_xdrsize		= NFS4_SVC_XDRSIZE,
+	.vs_rpcb_optnl		= true,
+	.vs_need_cong_ctrl	= true,
 };
 
 /*

+ 7 - 5
fs/nfsd/nfs4state.c

@@ -2281,7 +2281,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
 out_err:
 	conn->cb_addr.ss_family = AF_UNSPEC;
 	conn->cb_addrlen = 0;
-	dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
+	dprintk("NFSD: this client (clientid %08x/%08x) "
 		"will not receive delegations\n",
 		clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
 
@@ -7012,23 +7012,24 @@ nfs4_state_start(void)
 
 	ret = set_callback_cred();
 	if (ret)
-		return -ENOMEM;
+		return ret;
+
 	laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
 	if (laundry_wq == NULL) {
 		ret = -ENOMEM;
-		goto out_recovery;
+		goto out_cleanup_cred;
 	}
 	ret = nfsd4_create_callback_queue();
 	if (ret)
 		goto out_free_laundry;
 
 	set_max_delegations();
-
 	return 0;
 
 out_free_laundry:
 	destroy_workqueue(laundry_wq);
-out_recovery:
+out_cleanup_cred:
+	cleanup_callback_cred();
 	return ret;
 }
 
@@ -7086,6 +7087,7 @@ nfs4_state_shutdown(void)
 {
 	destroy_workqueue(laundry_wq);
 	nfsd4_destroy_callback_queue();
+	cleanup_callback_cred();
 }
 
 static void

+ 18 - 11
fs/nfsd/nfs4xdr.c

@@ -58,7 +58,7 @@
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
-u32 nfsd_suppattrs[3][3] = {
+const u32 nfsd_suppattrs[3][3] = {
 	{NFSD4_SUPPORTED_ATTRS_WORD0,
 	 NFSD4_SUPPORTED_ATTRS_WORD1,
 	 NFSD4_SUPPORTED_ATTRS_WORD2},
@@ -1250,7 +1250,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
 	READ_BUF(16);
 	p = xdr_decode_hyper(p, &write->wr_offset);
 	write->wr_stable_how = be32_to_cpup(p++);
-	if (write->wr_stable_how > 2)
+	if (write->wr_stable_how > NFS_FILE_SYNC)
 		goto xdr_error;
 	write->wr_buflen = be32_to_cpup(p++);
 
@@ -1941,12 +1941,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		} else
 			max_reply += nfsd4_max_reply(argp->rqstp, op);
 		/*
-		 * OP_LOCK may return a conflicting lock.  (Special case
-		 * because it will just skip encoding this if it runs
-		 * out of xdr buffer space, and it is the only operation
-		 * that behaves this way.)
+		 * OP_LOCK and OP_LOCKT may return a conflicting lock.
+		 * (Special case because it will just skip encoding this
+		 * if it runs out of xdr buffer space, and it is the only
+		 * operation that behaves this way.)
 		 */
-		if (op->opnum == OP_LOCK)
+		if (op->opnum == OP_LOCK || op->opnum == OP_LOCKT)
 			max_reply += NFS4_OPAQUE_LIMIT;
 
 		if (op->status) {
@@ -1966,9 +1966,13 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 	DECODE_TAIL;
 }
 
-static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode)
+static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
+			     struct svc_export *exp)
 {
-	if (IS_I_VERSION(inode)) {
+	if (exp->ex_flags & NFSEXP_V4ROOT) {
+		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
+		*p++ = 0;
+	} else if (IS_I_VERSION(inode)) {
 		p = xdr_encode_hyper(p, inode->i_version);
 	} else {
 		*p++ = cpu_to_be32(stat->ctime.tv_sec);
@@ -2417,8 +2421,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 	if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
 	     bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
-		err = security_inode_getsecctx(d_inode(dentry),
+		if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
+			err = security_inode_getsecctx(d_inode(dentry),
 						&context, &contextlen);
+		else
+			err = -EOPNOTSUPP;
 		contextsupport = (err == 0);
 		if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
 			if (err == -EOPNOTSUPP)
@@ -2490,7 +2497,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 		p = xdr_reserve_space(xdr, 8);
 		if (!p)
 			goto out_resource;
-		p = encode_change(p, &stat, d_inode(dentry));
+		p = encode_change(p, &stat, d_inode(dentry), exp);
 	}
 	if (bmval0 & FATTR4_WORD0_SIZE) {
 		p = xdr_reserve_space(xdr, 8);

+ 35 - 35
fs/nfsd/nfsctl.c

@@ -536,6 +536,19 @@ out_free:
 	return rv;
 }
 
+static ssize_t
+nfsd_print_version_support(char *buf, int remaining, const char *sep,
+		unsigned vers, unsigned minor)
+{
+	const char *format = (minor == 0) ? "%s%c%u" : "%s%c%u.%u";
+	bool supported = !!nfsd_vers(vers, NFSD_TEST);
+
+	if (vers == 4 && !nfsd_minorversion(minor, NFSD_TEST))
+		supported = false;
+	return snprintf(buf, remaining, format, sep,
+			supported ? '+' : '-', vers, minor);
+}
+
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
@@ -561,6 +574,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 		len = qword_get(&mesg, vers, size);
 		if (len <= 0) return -EINVAL;
 		do {
+			enum vers_op cmd;
 			sign = *vers;
 			if (sign == '+' || sign == '-')
 				num = simple_strtol((vers+1), &minorp, 0);
@@ -569,24 +583,22 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 			if (*minorp == '.') {
 				if (num != 4)
 					return -EINVAL;
-				minor = simple_strtoul(minorp+1, NULL, 0);
-				if (minor == 0)
-					return -EINVAL;
-				if (nfsd_minorversion(minor, sign == '-' ?
-						     NFSD_CLEAR : NFSD_SET) < 0)
+				if (kstrtouint(minorp+1, 0, &minor) < 0)
 					return -EINVAL;
-				goto next;
-			}
+			} else
+				minor = 0;
+			cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET;
 			switch(num) {
 			case 2:
 			case 3:
-			case 4:
-				nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET);
+				nfsd_vers(num, cmd);
 				break;
+			case 4:
+				if (nfsd_minorversion(minor, cmd) >= 0)
+					break;
 			default:
 				return -EINVAL;
 			}
-		next:
 			vers += len + 1;
 		} while ((len = qword_get(&mesg, vers, size)) > 0);
 		/* If all get turned off, turn them back on, as
@@ -599,35 +611,23 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 	len = 0;
 	sep = "";
 	remaining = SIMPLE_TRANSACTION_LIMIT;
-	for (num=2 ; num <= 4 ; num++)
-		if (nfsd_vers(num, NFSD_AVAIL)) {
-			len = snprintf(buf, remaining, "%s%c%d", sep,
-				       nfsd_vers(num, NFSD_TEST)?'+':'-',
-				       num);
-			sep = " ";
-
-			if (len >= remaining)
-				break;
-			remaining -= len;
-			buf += len;
-			tlen += len;
-		}
-	if (nfsd_vers(4, NFSD_AVAIL))
-		for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION;
-		     minor++) {
-			len = snprintf(buf, remaining, " %c4.%u",
-					(nfsd_vers(4, NFSD_TEST) &&
-					 nfsd_minorversion(minor, NFSD_TEST)) ?
-						'+' : '-',
-					minor);
-
+	for (num=2 ; num <= 4 ; num++) {
+		if (!nfsd_vers(num, NFSD_AVAIL))
+			continue;
+		minor = 0;
+		do {
+			len = nfsd_print_version_support(buf, remaining,
+					sep, num, minor);
 			if (len >= remaining)
-				break;
+				goto out;
 			remaining -= len;
 			buf += len;
 			tlen += len;
-		}
-
+			minor++;
+			sep = " ";
+		} while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION);
+	}
+out:
 	len = snprintf(buf, remaining, "\n");
 	if (len >= remaining)
 		return -EINVAL;

+ 3 - 3
fs/nfsd/nfsd.h

@@ -362,16 +362,16 @@ void		nfsd_lockd_shutdown(void);
 	FATTR4_WORD2_MODE_UMASK | \
 	NFSD4_2_SECURITY_ATTRS)
 
-extern u32 nfsd_suppattrs[3][3];
+extern const u32 nfsd_suppattrs[3][3];
 
-static inline bool bmval_is_subset(u32 *bm1, u32 *bm2)
+static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2)
 {
 	return !((bm1[0] & ~bm2[0]) ||
 	         (bm1[1] & ~bm2[1]) ||
 		 (bm1[2] & ~bm2[2]));
 }
 
-static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval)
+static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
 {
 	return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]);
 }

+ 2 - 6
fs/nfsd/nfsproc.c

@@ -204,18 +204,14 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 					struct nfsd_attrstat  *resp)
 {
 	__be32	nfserr;
-	int	stable = 1;
 	unsigned long cnt = argp->len;
 
 	dprintk("nfsd: WRITE    %s %d bytes at %d\n",
 		SVCFH_fmt(&argp->fh),
 		argp->len, argp->offset);
 
-	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
-				   argp->offset,
-				   rqstp->rq_vec, argp->vlen,
-			           &cnt,
-				   &stable);
+	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset,
+				rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC);
 	return nfsd_return_attrs(nfserr, resp);
 }
 

+ 16 - 0
fs/nfsd/nfssvc.c

@@ -153,6 +153,18 @@ int nfsd_vers(int vers, enum vers_op change)
 	return 0;
 }
 
+static void
+nfsd_adjust_nfsd_versions4(void)
+{
+	unsigned i;
+
+	for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) {
+		if (nfsd_supported_minorversions[i])
+			return;
+	}
+	nfsd_vers(4, NFSD_CLEAR);
+}
+
 int nfsd_minorversion(u32 minorversion, enum vers_op change)
 {
 	if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
@@ -160,9 +172,11 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
 	switch(change) {
 	case NFSD_SET:
 		nfsd_supported_minorversions[minorversion] = true;
+		nfsd_vers(4, NFSD_SET);
 		break;
 	case NFSD_CLEAR:
 		nfsd_supported_minorversions[minorversion] = false;
+		nfsd_adjust_nfsd_versions4();
 		break;
 	case NFSD_TEST:
 		return nfsd_supported_minorversions[minorversion];
@@ -354,6 +368,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
 		dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
 		sin6.sin6_family = AF_INET6;
 		sin6.sin6_addr = ifa->addr;
+		if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+			sin6.sin6_scope_id = ifa->idev->dev->ifindex;
 		svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
 	}
 

+ 1 - 0
fs/nfsd/state.h

@@ -615,6 +615,7 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
 		struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
 extern int set_callback_cred(void);
+extern void cleanup_callback_cred(void);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);

+ 51 - 53
fs/nfsd/vfs.c

@@ -377,7 +377,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	__be32		err;
 	int		host_err;
 	bool		get_write_count;
-	int		size_change = 0;
+	bool		size_change = (iap->ia_valid & ATTR_SIZE);
 
 	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
 		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
@@ -390,11 +390,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	/* Get inode */
 	err = fh_verify(rqstp, fhp, ftype, accmode);
 	if (err)
-		goto out;
+		return err;
 	if (get_write_count) {
 		host_err = fh_want_write(fhp);
 		if (host_err)
-			return nfserrno(host_err);
+			goto out;
 	}
 
 	dentry = fhp->fh_dentry;
@@ -405,20 +405,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		iap->ia_valid &= ~ATTR_MODE;
 
 	if (!iap->ia_valid)
-		goto out;
+		return 0;
 
 	nfsd_sanitize_attrs(inode, iap);
 
+	if (check_guard && guardtime != inode->i_ctime.tv_sec)
+		return nfserr_notsync;
+
 	/*
 	 * The size case is special, it changes the file in addition to the
-	 * attributes.
+	 * attributes, and file systems don't expect it to be mixed with
+	 * "random" attribute changes.  We thus split out the size change
+	 * into a separate call to ->setattr, and do the rest as a separate
+	 * setattr call.
 	 */
-	if (iap->ia_valid & ATTR_SIZE) {
+	if (size_change) {
 		err = nfsd_get_write_access(rqstp, fhp, iap);
 		if (err)
-			goto out;
-		size_change = 1;
+			return err;
+	}
 
+	fh_lock(fhp);
+	if (size_change) {
 		/*
 		 * RFC5661, Section 18.30.4:
 		 *   Changing the size of a file with SETATTR indirectly
@@ -426,29 +434,36 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		 *
 		 * (and similar for the older RFCs)
 		 */
-		if (iap->ia_size != i_size_read(inode))
-			iap->ia_valid |= ATTR_MTIME;
-	}
+		struct iattr size_attr = {
+			.ia_valid	= ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
+			.ia_size	= iap->ia_size,
+		};
 
-	iap->ia_valid |= ATTR_CTIME;
+		host_err = notify_change(dentry, &size_attr, NULL);
+		if (host_err)
+			goto out_unlock;
+		iap->ia_valid &= ~ATTR_SIZE;
 
-	if (check_guard && guardtime != inode->i_ctime.tv_sec) {
-		err = nfserr_notsync;
-		goto out_put_write_access;
+		/*
+		 * Avoid the additional setattr call below if the only other
+		 * attribute that the client sends is the mtime, as we update
+		 * it as part of the size change above.
+		 */
+		if ((iap->ia_valid & ~ATTR_MTIME) == 0)
+			goto out_unlock;
 	}
 
-	fh_lock(fhp);
+	iap->ia_valid |= ATTR_CTIME;
 	host_err = notify_change(dentry, iap, NULL);
-	fh_unlock(fhp);
-	err = nfserrno(host_err);
 
-out_put_write_access:
+out_unlock:
+	fh_unlock(fhp);
 	if (size_change)
 		put_write_access(inode);
-	if (!err)
-		err = nfserrno(commit_metadata(fhp));
 out:
-	return err;
+	if (!host_err)
+		host_err = commit_metadata(fhp);
+	return nfserrno(host_err);
 }
 
 #if defined(CONFIG_NFSD_V4)
@@ -940,14 +955,12 @@ static int wait_for_concurrent_writes(struct file *file)
 __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
-				unsigned long *cnt, int *stablep)
+				unsigned long *cnt, int stable)
 {
 	struct svc_export	*exp;
-	struct inode		*inode;
 	mm_segment_t		oldfs;
 	__be32			err = 0;
 	int			host_err;
-	int			stable = *stablep;
 	int			use_wgather;
 	loff_t			pos = offset;
 	unsigned int		pflags = current->flags;
@@ -962,13 +975,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		 */
 		current->flags |= PF_LESS_THROTTLE;
 
-	inode = file_inode(file);
-	exp   = fhp->fh_export;
-
+	exp = fhp->fh_export;
 	use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
 
 	if (!EX_ISSYNC(exp))
-		stable = 0;
+		stable = NFS_UNSTABLE;
 
 	if (stable && !use_wgather)
 		flags |= RWF_SYNC;
@@ -1035,35 +1046,22 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * N.B. After this call fhp needs an fh_put
  */
 __be32
-nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
-		loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
-		int *stablep)
+nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
+	   struct kvec *vec, int vlen, unsigned long *cnt, int stable)
 {
-	__be32			err = 0;
+	struct file *file = NULL;
+	__be32 err = 0;
 
 	trace_write_start(rqstp, fhp, offset, vlen);
 
-	if (file) {
-		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
-				NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
-		if (err)
-			goto out;
-		trace_write_opened(rqstp, fhp, offset, vlen);
-		err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
-				stablep);
-		trace_write_io_done(rqstp, fhp, offset, vlen);
-	} else {
-		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
-		if (err)
-			goto out;
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+	if (err)
+		goto out;
 
-		trace_write_opened(rqstp, fhp, offset, vlen);
-		if (cnt)
-			err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
-					     cnt, stablep);
-		trace_write_io_done(rqstp, fhp, offset, vlen);
-		fput(file);
-	}
+	trace_write_opened(rqstp, fhp, offset, vlen);
+	err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
+	trace_write_io_done(rqstp, fhp, offset, vlen);
+	fput(file);
 out:
 	trace_write_done(rqstp, fhp, offset, vlen);
 	return err;

+ 3 - 3
fs/nfsd/vfs.h

@@ -83,12 +83,12 @@ __be32		nfsd_readv(struct file *, loff_t, struct kvec *, int,
 				unsigned long *);
 __be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
 				loff_t, struct kvec *, int, unsigned long *);
-__be32 		nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
-				loff_t, struct kvec *,int, unsigned long *, int *);
+__be32 		nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
+				struct kvec *, int, unsigned long *, int);
 __be32		nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				struct file *file, loff_t offset,
 				struct kvec *vec, int vlen, unsigned long *cnt,
-				int *stablep);
+				int stable);
 __be32		nfsd_readlink(struct svc_rqst *, struct svc_fh *,
 				char *, int *);
 __be32		nfsd_symlink(struct svc_rqst *, struct svc_fh *,

+ 5 - 1
include/linux/sunrpc/cache.h

@@ -204,8 +204,11 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
 	kref_put(&h->ref, cd->cache_put);
 }
 
-static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
+static inline bool cache_is_expired(struct cache_detail *detail, struct cache_head *h)
 {
+	if (!test_bit(CACHE_VALID, &h->flags))
+		return false;
+
 	return  (h->expiry_time < seconds_since_boot()) ||
 		(detail->flush_time >= h->last_refresh);
 }
@@ -227,6 +230,7 @@ extern void sunrpc_destroy_cache_detail(struct cache_detail *cd);
 extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *,
 					umode_t, struct cache_detail *);
 extern void sunrpc_cache_unregister_pipefs(struct cache_detail *);
+extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *);
 
 /* Must store cache_detail in seq_file->private if using next three functions */
 extern void *cache_seq_start(struct seq_file *file, loff_t *pos);

+ 9 - 0
include/linux/sunrpc/rpc_rdma.h

@@ -109,6 +109,15 @@ struct rpcrdma_msg {
 	} rm_body;
 };
 
+/*
+ * XDR sizes, in quads
+ */
+enum {
+	rpcrdma_fixed_maxsz	= 4,
+	rpcrdma_segment_maxsz	= 4,
+	rpcrdma_readchunk_maxsz	= 2 + rpcrdma_segment_maxsz,
+};
+
 /*
  * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks
  */

+ 8 - 4
include/linux/sunrpc/svc.h

@@ -400,10 +400,14 @@ struct svc_version {
 	struct svc_procedure *	vs_proc;	/* per-procedure info */
 	u32			vs_xdrsize;	/* xdrsize needed for this version */
 
-	unsigned int		vs_hidden : 1,	/* Don't register with portmapper.
-						 * Only used for nfsacl so far. */
-				vs_rpcb_optnl:1;/* Don't care the result of register.
-						 * Only used for nfsv4. */
+	/* Don't register with rpcbind */
+	bool			vs_hidden;
+
+	/* Don't care if the rpcbind registration fails */
+	bool			vs_rpcb_optnl;
+
+	/* Need xprt with congestion control */
+	bool			vs_need_cong_ctrl;
 
 	/* Override dispatch function (e.g. when caching replies).
 	 * A return value of 0 means drop the request. 

+ 4 - 9
include/linux/sunrpc/svc_rdma.h

@@ -70,7 +70,7 @@ extern atomic_t rdma_stat_sq_prod;
  * completes.
  */
 struct svc_rdma_op_ctxt {
-	struct list_head free;
+	struct list_head list;
 	struct svc_rdma_op_ctxt *read_hdr;
 	struct svc_rdma_fastreg_mr *frmr;
 	int hdr_count;
@@ -78,7 +78,6 @@ struct svc_rdma_op_ctxt {
 	struct ib_cqe cqe;
 	struct ib_cqe reg_cqe;
 	struct ib_cqe inv_cqe;
-	struct list_head dto_q;
 	u32 byte_len;
 	u32 position;
 	struct svcxprt_rdma *xprt;
@@ -141,7 +140,8 @@ struct svcxprt_rdma {
 	atomic_t             sc_sq_avail;	/* SQEs ready to be consumed */
 	unsigned int	     sc_sq_depth;	/* Depth of SQ */
 	unsigned int	     sc_rq_depth;	/* Depth of RQ */
-	u32		     sc_max_requests;	/* Forward credits */
+	__be32		     sc_fc_credits;	/* Forward credits */
+	u32		     sc_max_requests;	/* Max requests */
 	u32		     sc_max_bc_requests;/* Backward credits */
 	int                  sc_max_req_size;	/* Size of each RQ WR buf */
 
@@ -171,7 +171,6 @@ struct svcxprt_rdma {
 
 	wait_queue_head_t    sc_send_wait;	/* SQ exhaustion waitlist */
 	unsigned long	     sc_flags;
-	struct list_head     sc_dto_q;		/* DTO tasklet I/O pending Q */
 	struct list_head     sc_read_complete_q;
 	struct work_struct   sc_work;
 };
@@ -214,11 +213,7 @@ extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
 extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
 extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
 					    __be32, __be64, u32);
-extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
-					     struct rpcrdma_msg *,
-					     struct rpcrdma_msg *,
-					     enum rpcrdma_proc);
-extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
+extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp);
 
 /* svc_rdma_recvfrom.c */
 extern int svc_rdma_recvfrom(struct svc_rqst *);

+ 1 - 0
include/linux/sunrpc/svc_xprt.h

@@ -67,6 +67,7 @@ struct svc_xprt {
 #define XPT_CACHE_AUTH	11		/* cache auth info */
 #define XPT_LOCAL	12		/* connection from loopback interface */
 #define XPT_KILL_TEMP   13		/* call xpo_kill_temp_xprt before closing */
+#define XPT_CONG_CTRL	14		/* has congestion control */
 
 	struct svc_serv		*xpt_server;	/* service for transport */
 	atomic_t    	    	xpt_reserved;	/* space on outq that is rsvd */

+ 3 - 2
include/uapi/linux/nfsd/export.h

@@ -32,7 +32,8 @@
 #define NFSEXP_ASYNC		0x0010
 #define NFSEXP_GATHERED_WRITES	0x0020
 #define NFSEXP_NOREADDIRPLUS    0x0040
-/* 80 100 currently unused */
+#define NFSEXP_SECURITY_LABEL	0x0080
+/* 0x100 currently unused */
 #define NFSEXP_NOHIDE		0x0200
 #define NFSEXP_NOSUBTREECHECK	0x0400
 #define	NFSEXP_NOAUTHNLM	0x0800		/* Don't authenticate NLM requests - just trust */
@@ -53,7 +54,7 @@
 #define NFSEXP_PNFS		0x20000
 
 /* All flags that we claim to support.  (Note we don't support NOACL.) */
-#define NFSEXP_ALLFLAGS		0x3FE7F
+#define NFSEXP_ALLFLAGS		0x3FEFF
 
 /* The flags that may vary depending on security flavor: */
 #define NFSEXP_SECINFO_FLAGS	(NFSEXP_READONLY | NFSEXP_ROOTSQUASH \

+ 2 - 2
net/sunrpc/auth_gss/svcauth_gss.c

@@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
 	case RPC_GSS_PROC_DESTROY:
 		if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
 			goto auth_err;
-		rsci->h.expiry_time = seconds_since_boot();
-		set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+		/* Delete the entry from the cache_list and call cache_put */
+		sunrpc_cache_unhash(sn->rsc_cache, &rsci->h);
 		if (resv->iov_len + 4 > PAGE_SIZE)
 			goto drop;
 		svc_putnl(resv, RPC_SUCCESS);

+ 38 - 15
net/sunrpc/cache.c

@@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
 	cache_purge(cd);
 	spin_lock(&cache_list_lock);
 	write_lock(&cd->hash_lock);
-	if (cd->entries) {
-		write_unlock(&cd->hash_lock);
-		spin_unlock(&cache_list_lock);
-		goto out;
-	}
 	if (current_detail == cd)
 		current_detail = NULL;
 	list_del_init(&cd->others);
@@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
 		/* module must be being unloaded so its safe to kill the worker */
 		cancel_delayed_work_sync(&cache_cleaner);
 	}
-	return;
-out:
-	printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
 }
 EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
 
@@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush);
 
 void cache_purge(struct cache_detail *detail)
 {
-	time_t now = seconds_since_boot();
-	if (detail->flush_time >= now)
-		now = detail->flush_time + 1;
-	/* 'now' is the maximum value any 'last_refresh' can have */
-	detail->flush_time = now;
-	detail->nextcheck = seconds_since_boot();
-	cache_flush();
+	struct cache_head *ch = NULL;
+	struct hlist_head *head = NULL;
+	struct hlist_node *tmp = NULL;
+	int i = 0;
+
+	write_lock(&detail->hash_lock);
+	if (!detail->entries) {
+		write_unlock(&detail->hash_lock);
+		return;
+	}
+
+	dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
+	for (i = 0; i < detail->hash_size; i++) {
+		head = &detail->hash_table[i];
+		hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
+			hlist_del_init(&ch->cache_list);
+			detail->entries--;
+
+			set_bit(CACHE_CLEANED, &ch->flags);
+			write_unlock(&detail->hash_lock);
+			cache_fresh_unlocked(ch, detail);
+			cache_put(ch, detail);
+			write_lock(&detail->hash_lock);
+		}
+	}
+	write_unlock(&detail->hash_lock);
 }
 EXPORT_SYMBOL_GPL(cache_purge);
 
@@ -1855,3 +1866,15 @@ void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
 }
 EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
 
+void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
+{
+	write_lock(&cd->hash_lock);
+	if (!hlist_unhashed(&h->cache_list)){
+		hlist_del_init(&h->cache_list);
+		cd->entries--;
+		write_unlock(&cd->hash_lock);
+		cache_put(h, cd);
+	} else
+		write_unlock(&cd->hash_lock);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);

+ 23 - 1
net/sunrpc/svc.c

@@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
 		for (i = 0; i < progp->pg_nvers; i++) {
 			if (progp->pg_vers[i] == NULL)
 				continue;
-			if (progp->pg_vers[i]->vs_hidden == 0)
+			if (!progp->pg_vers[i]->vs_hidden)
 				return 1;
 		}
 	}
@@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net,
 			if (vers->vs_hidden)
 				continue;
 
+			/*
+			 * Don't register a UDP port if we need congestion
+			 * control.
+			 */
+			if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
+				continue;
+
 			error = __svc_register(net, progp->pg_name, progp->pg_prog,
 						i, family, proto, port);
 
@@ -1169,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	  !(versp = progp->pg_vers[vers]))
 		goto err_bad_vers;
 
+	/*
+	 * Some protocol versions (namely NFSv4) require some form of
+	 * congestion control.  (See RFC 7530 section 3.1 paragraph 2)
+	 * In other words, UDP is not allowed. We mark those when setting
+	 * up the svc_xprt, and verify that here.
+	 *
+	 * The spec is not very clear about what error should be returned
+	 * when someone tries to access a server that is listening on UDP
+	 * for lower versions. RPC_PROG_MISMATCH seems to be the closest
+	 * fit.
+	 */
+	if (versp->vs_need_cong_ctrl &&
+	    !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+		goto err_bad_vers;
+
 	procp = versp->vs_proc + proc;
 	if (proc >= versp->vs_nproc || !procp->pc_func)
 		goto err_bad_proc;

+ 1 - 0
net/sunrpc/svcsock.c

@@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
 		      &svsk->sk_xprt, serv);
 	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+	set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
 	if (sk->sk_state == TCP_LISTEN) {
 		dprintk("setting up TCP socket for listening\n");
 		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);

+ 9 - 8
net/sunrpc/xprtrdma/svc_rdma_backchannel.c

@@ -201,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
 {
 	struct rpc_xprt *xprt = rqst->rq_xprt;
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-	struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+	__be32 *p;
 	int rc;
 
 	/* Space in the send buffer for an RPC/RDMA header is reserved
 	 * via xprt->tsh_size.
 	 */
-	headerp->rm_xid = rqst->rq_xid;
-	headerp->rm_vers = rpcrdma_version;
-	headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
-	headerp->rm_type = rdma_msg;
-	headerp->rm_body.rm_chunks[0] = xdr_zero;
-	headerp->rm_body.rm_chunks[1] = xdr_zero;
-	headerp->rm_body.rm_chunks[2] = xdr_zero;
+	p = rqst->rq_buffer;
+	*p++ = rqst->rq_xid;
+	*p++ = rpcrdma_version;
+	*p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+	*p++ = rdma_msg;
+	*p++ = xdr_zero;
+	*p++ = xdr_zero;
+	*p   = xdr_zero;
 
 #ifdef SVCRDMA_BACKCHANNEL_DEBUG
 	pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);

+ 106 - 193
net/sunrpc/xprtrdma/svc_rdma_marshal.c

@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2016 Oracle. All rights reserved.
  * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -47,102 +48,43 @@
 
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
-/*
- * Decodes a read chunk list. The expected format is as follows:
- *    descrim  : xdr_one
- *    position : __be32 offset into XDR stream
- *    handle   : __be32 RKEY
- *    . . .
- *  end-of-list: xdr_zero
- */
-static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_read_list(__be32 *p, __be32 *end)
 {
-	struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+	__be32 *next;
 
-	while (ch->rc_discrim != xdr_zero) {
-		if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
-		    (unsigned long)vaend) {
-			dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+	while (*p++ != xdr_zero) {
+		next = p + rpcrdma_readchunk_maxsz - 1;
+		if (next > end)
 			return NULL;
-		}
-		ch++;
+		p = next;
 	}
-	return &ch->rc_position;
+	return p;
 }
 
-/*
- * Decodes a write chunk list. The expected format is as follows:
- *    descrim  : xdr_one
- *    nchunks  : <count>
- *       handle   : __be32 RKEY           ---+
- *       length   : __be32 <len of segment>  |
- *       offset   : remove va                + <count>
- *       . . .                               |
- *                                        ---+
- */
-static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_write_list(__be32 *p, __be32 *end)
 {
-	unsigned long start, end;
-	int nchunks;
-
-	struct rpcrdma_write_array *ary =
-		(struct rpcrdma_write_array *)va;
+	__be32 *next;
 
-	/* Check for not write-array */
-	if (ary->wc_discrim == xdr_zero)
-		return &ary->wc_nchunks;
-
-	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
-	    (unsigned long)vaend) {
-		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
-		return NULL;
-	}
-	nchunks = be32_to_cpu(ary->wc_nchunks);
-
-	start = (unsigned long)&ary->wc_array[0];
-	end = (unsigned long)vaend;
-	if (nchunks < 0 ||
-	    nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
-	    (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
-		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
-			ary, nchunks, vaend);
-		return NULL;
+	while (*p++ != xdr_zero) {
+		next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+		if (next > end)
+			return NULL;
+		p = next;
 	}
-	/*
-	 * rs_length is the 2nd 4B field in wc_target and taking its
-	 * address skips the list terminator
-	 */
-	return &ary->wc_array[nchunks].wc_target.rs_length;
+	return p;
 }
 
-static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end)
 {
-	unsigned long start, end;
-	int nchunks;
-	struct rpcrdma_write_array *ary =
-		(struct rpcrdma_write_array *)va;
-
-	/* Check for no reply-array */
-	if (ary->wc_discrim == xdr_zero)
-		return &ary->wc_nchunks;
-
-	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
-	    (unsigned long)vaend) {
-		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
-		return NULL;
-	}
-	nchunks = be32_to_cpu(ary->wc_nchunks);
-
-	start = (unsigned long)&ary->wc_array[0];
-	end = (unsigned long)vaend;
-	if (nchunks < 0 ||
-	    nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
-	    (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
-		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
-			ary, nchunks, vaend);
-		return NULL;
+	__be32 *next;
+
+	if (*p++ != xdr_zero) {
+		next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+		if (next > end)
+			return NULL;
+		p = next;
 	}
-	return (__be32 *)&ary->wc_array[nchunks];
+	return p;
 }
 
 /**
@@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
  */
 int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
 {
-	struct rpcrdma_msg *rmsgp;
-	__be32 *va, *vaend;
-	unsigned int len;
-	u32 hdr_len;
+	__be32 *p, *end, *rdma_argp;
+	unsigned int hdr_len;
 
 	/* Verify that there's enough bytes for header + something */
-	if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
-		dprintk("svcrdma: header too short = %d\n",
-			rq_arg->len);
-		return -EINVAL;
-	}
+	if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
+		goto out_short;
 
-	rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
-	if (rmsgp->rm_vers != rpcrdma_version) {
-		dprintk("%s: bad version %u\n", __func__,
-			be32_to_cpu(rmsgp->rm_vers));
-		return -EPROTONOSUPPORT;
-	}
+	rdma_argp = rq_arg->head[0].iov_base;
+	if (*(rdma_argp + 1) != rpcrdma_version)
+		goto out_version;
 
-	switch (be32_to_cpu(rmsgp->rm_type)) {
-	case RDMA_MSG:
-	case RDMA_NOMSG:
+	switch (*(rdma_argp + 3)) {
+	case rdma_msg:
+	case rdma_nomsg:
 		break;
 
-	case RDMA_DONE:
-		/* Just drop it */
-		dprintk("svcrdma: dropping RDMA_DONE message\n");
-		return 0;
-
-	case RDMA_ERROR:
-		/* Possible if this is a backchannel reply.
-		 * XXX: We should cancel this XID, though.
-		 */
-		dprintk("svcrdma: dropping RDMA_ERROR message\n");
-		return 0;
-
-	case RDMA_MSGP:
-		/* Pull in the extra for the padded case, bump our pointer */
-		rmsgp->rm_body.rm_padded.rm_align =
-			be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
-		rmsgp->rm_body.rm_padded.rm_thresh =
-			be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
-
-		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
-		rq_arg->head[0].iov_base = va;
-		len = (u32)((unsigned long)va - (unsigned long)rmsgp);
-		rq_arg->head[0].iov_len -= len;
-		if (len > rq_arg->len)
-			return -EINVAL;
-		return len;
-	default:
-		dprintk("svcrdma: bad rdma procedure (%u)\n",
-			be32_to_cpu(rmsgp->rm_type));
-		return -EINVAL;
-	}
+	case rdma_done:
+		goto out_drop;
 
-	/* The chunk list may contain either a read chunk list or a write
-	 * chunk list and a reply chunk list.
-	 */
-	va = &rmsgp->rm_body.rm_chunks[0];
-	vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
-	va = decode_read_list(va, vaend);
-	if (!va) {
-		dprintk("svcrdma: failed to decode read list\n");
-		return -EINVAL;
-	}
-	va = decode_write_list(va, vaend);
-	if (!va) {
-		dprintk("svcrdma: failed to decode write list\n");
-		return -EINVAL;
-	}
-	va = decode_reply_array(va, vaend);
-	if (!va) {
-		dprintk("svcrdma: failed to decode reply chunk\n");
-		return -EINVAL;
+	case rdma_error:
+		goto out_drop;
+
+	default:
+		goto out_proc;
 	}
 
-	rq_arg->head[0].iov_base = va;
-	hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+	end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
+	p = xdr_check_read_list(rdma_argp + 4, end);
+	if (!p)
+		goto out_inval;
+	p = xdr_check_write_list(p, end);
+	if (!p)
+		goto out_inval;
+	p = xdr_check_reply_chunk(p, end);
+	if (!p)
+		goto out_inval;
+	if (p > end)
+		goto out_inval;
+
+	rq_arg->head[0].iov_base = p;
+	hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
 	rq_arg->head[0].iov_len -= hdr_len;
 	return hdr_len;
+
+out_short:
+	dprintk("svcrdma: header too short = %d\n", rq_arg->len);
+	return -EINVAL;
+
+out_version:
+	dprintk("svcrdma: bad xprt version: %u\n",
+		be32_to_cpup(rdma_argp + 1));
+	return -EPROTONOSUPPORT;
+
+out_drop:
+	dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n");
+	return 0;
+
+out_proc:
+	dprintk("svcrdma: bad rdma procedure (%u)\n",
+		be32_to_cpup(rdma_argp + 3));
+	return -EINVAL;
+
+out_inval:
+	dprintk("svcrdma: failed to parse transport header\n");
+	return -EINVAL;
 }
 
 int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
@@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
 
 	*va++ = rmsgp->rm_xid;
 	*va++ = rmsgp->rm_vers;
-	*va++ = cpu_to_be32(xprt->sc_max_requests);
+	*va++ = xprt->sc_fc_credits;
 	*va++ = rdma_error;
 	*va++ = cpu_to_be32(err);
 	if (err == ERR_VERS) {
@@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
 	return (int)((unsigned long)va - (unsigned long)startp);
 }
 
-int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+/**
+ * svc_rdma_xdr_get_reply_hdr_len - Get length of Reply transport header
+ * @rdma_resp: buffer containing Reply transport header
+ *
+ * Returns length of transport header, in bytes.
+ */
+unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
 {
-	struct rpcrdma_write_array *wr_ary;
+	unsigned int nsegs;
+	__be32 *p;
 
-	/* There is no read-list in a reply */
+	p = rdma_resp;
 
-	/* skip write list */
-	wr_ary = (struct rpcrdma_write_array *)
-		&rmsgp->rm_body.rm_chunks[1];
-	if (wr_ary->wc_discrim)
-		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
-			wc_target.rs_length;
-	else
-		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_nchunks;
-
-	/* skip reply array */
-	if (wr_ary->wc_discrim)
-		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
-	else
-		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_nchunks;
-
-	return (unsigned long) wr_ary - (unsigned long) rmsgp;
+	/* RPC-over-RDMA V1 replies never have a Read list. */
+	p += rpcrdma_fixed_maxsz + 1;
+
+	/* Skip Write list. */
+	while (*p++ != xdr_zero) {
+		nsegs = be32_to_cpup(p++);
+		p += nsegs * rpcrdma_segment_maxsz;
+	}
+
+	/* Skip Reply chunk. */
+	if (*p++ != xdr_zero) {
+		nsegs = be32_to_cpup(p++);
+		p += nsegs * rpcrdma_segment_maxsz;
+	}
+
+	return (unsigned long)p - (unsigned long)rdma_resp;
 }
 
 void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
@@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
 	seg->rs_offset = rs_offset;
 	seg->rs_length = cpu_to_be32(write_len);
 }
-
-void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
-				  struct rpcrdma_msg *rdma_argp,
-				  struct rpcrdma_msg *rdma_resp,
-				  enum rpcrdma_proc rdma_type)
-{
-	rdma_resp->rm_xid = rdma_argp->rm_xid;
-	rdma_resp->rm_vers = rdma_argp->rm_vers;
-	rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
-	rdma_resp->rm_type = cpu_to_be32(rdma_type);
-
-	/* Encode <nul> chunks lists */
-	rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
-	rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
-	rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
-}

+ 9 - 11
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c

@@ -606,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
 	dprintk("svcrdma: rqstp=%p\n", rqstp);
 
-	spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+	spin_lock(&rdma_xprt->sc_rq_dto_lock);
 	if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
-		ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
-		spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+		ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
+		spin_unlock(&rdma_xprt->sc_rq_dto_lock);
 		rdma_read_complete(rqstp, ctxt);
 		goto complete;
 	} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
-		ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
+		ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 	} else {
 		atomic_inc(&rdma_stat_rq_starve);
 		clear_bit(XPT_DATA, &xprt->xpt_flags);
 		ctxt = NULL;
 	}
-	spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+	spin_unlock(&rdma_xprt->sc_rq_dto_lock);
 	if (!ctxt) {
 		/* This is the EAGAIN path. The svc_recv routine will
 		 * return -EAGAIN, the nfsd thread will go to call into

+ 14 - 8
net/sunrpc/xprtrdma/svc_rdma_sendto.c

@@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
 
 	/* Prepare the SGE for the RPCRDMA Header */
 	ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
-	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+	ctxt->sge[0].length =
+	    svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
 	ctxt->sge[0].addr =
 	    ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
 			    ctxt->sge[0].length, DMA_TO_DEVICE);
@@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	struct rpcrdma_msg *rdma_argp;
 	struct rpcrdma_msg *rdma_resp;
 	struct rpcrdma_write_array *wr_ary, *rp_ary;
-	enum rpcrdma_proc reply_type;
 	int ret;
 	int inline_bytes;
 	struct page *res_page;
 	struct svc_rdma_req_map *vec;
 	u32 inv_rkey;
+	__be32 *p;
 
 	dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
 
@@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	if (!res_page)
 		goto err0;
 	rdma_resp = page_address(res_page);
-	if (rp_ary)
-		reply_type = RDMA_NOMSG;
-	else
-		reply_type = RDMA_MSG;
-	svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
-					 rdma_resp, reply_type);
+
+	p = &rdma_resp->rm_xid;
+	*p++ = rdma_argp->rm_xid;
+	*p++ = rdma_argp->rm_vers;
+	*p++ = rdma->sc_fc_credits;
+	*p++ = rp_ary ? rdma_nomsg : rdma_msg;
+
+	/* Start with empty chunks */
+	*p++ = xdr_zero;
+	*p++ = xdr_zero;
+	*p   = xdr_zero;
 
 	/* Send any write-chunk data and build resp write-list */
 	if (wr_ary) {

+ 37 - 32
net/sunrpc/xprtrdma/svc_rdma_transport.c

@@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
 	ctxt = kmalloc(sizeof(*ctxt), flags);
 	if (ctxt) {
 		ctxt->xprt = xprt;
-		INIT_LIST_HEAD(&ctxt->free);
-		INIT_LIST_HEAD(&ctxt->dto_q);
+		INIT_LIST_HEAD(&ctxt->list);
 	}
 	return ctxt;
 }
@@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
 			dprintk("svcrdma: No memory for RDMA ctxt\n");
 			return false;
 		}
-		list_add(&ctxt->free, &xprt->sc_ctxts);
+		list_add(&ctxt->list, &xprt->sc_ctxts);
 	}
 	return true;
 }
@@ -189,15 +188,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 {
 	struct svc_rdma_op_ctxt *ctxt = NULL;
 
-	spin_lock_bh(&xprt->sc_ctxt_lock);
+	spin_lock(&xprt->sc_ctxt_lock);
 	xprt->sc_ctxt_used++;
 	if (list_empty(&xprt->sc_ctxts))
 		goto out_empty;
 
 	ctxt = list_first_entry(&xprt->sc_ctxts,
-				struct svc_rdma_op_ctxt, free);
-	list_del_init(&ctxt->free);
-	spin_unlock_bh(&xprt->sc_ctxt_lock);
+				struct svc_rdma_op_ctxt, list);
+	list_del(&ctxt->list);
+	spin_unlock(&xprt->sc_ctxt_lock);
 
 out:
 	ctxt->count = 0;
@@ -209,15 +208,15 @@ out_empty:
 	/* Either pre-allocation missed the mark, or send
 	 * queue accounting is broken.
 	 */
-	spin_unlock_bh(&xprt->sc_ctxt_lock);
+	spin_unlock(&xprt->sc_ctxt_lock);
 
 	ctxt = alloc_ctxt(xprt, GFP_NOIO);
 	if (ctxt)
 		goto out;
 
-	spin_lock_bh(&xprt->sc_ctxt_lock);
+	spin_lock(&xprt->sc_ctxt_lock);
 	xprt->sc_ctxt_used--;
-	spin_unlock_bh(&xprt->sc_ctxt_lock);
+	spin_unlock(&xprt->sc_ctxt_lock);
 	WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
 	return NULL;
 }
@@ -254,10 +253,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
 		for (i = 0; i < ctxt->count; i++)
 			put_page(ctxt->pages[i]);
 
-	spin_lock_bh(&xprt->sc_ctxt_lock);
+	spin_lock(&xprt->sc_ctxt_lock);
 	xprt->sc_ctxt_used--;
-	list_add(&ctxt->free, &xprt->sc_ctxts);
-	spin_unlock_bh(&xprt->sc_ctxt_lock);
+	list_add(&ctxt->list, &xprt->sc_ctxts);
+	spin_unlock(&xprt->sc_ctxt_lock);
 }
 
 static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
@@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
 		struct svc_rdma_op_ctxt *ctxt;
 
 		ctxt = list_first_entry(&xprt->sc_ctxts,
-					struct svc_rdma_op_ctxt, free);
-		list_del(&ctxt->free);
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 		kfree(ctxt);
 	}
 }
@@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	/* All wc fields are now known to be valid */
 	ctxt->byte_len = wc->byte_len;
 	spin_lock(&xprt->sc_rq_dto_lock);
-	list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+	list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
 	spin_unlock(&xprt->sc_rq_dto_lock);
 
 	set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
@@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc)
 
 		read_hdr = ctxt->read_hdr;
 		spin_lock(&xprt->sc_rq_dto_lock);
-		list_add_tail(&read_hdr->dto_q,
+		list_add_tail(&read_hdr->list,
 			      &xprt->sc_read_complete_q);
 		spin_unlock(&xprt->sc_rq_dto_lock);
 
@@ -557,7 +556,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 		return NULL;
 	svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
 	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
-	INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
@@ -571,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 	spin_lock_init(&cma_xprt->sc_ctxt_lock);
 	spin_lock_init(&cma_xprt->sc_map_lock);
 
+	/*
+	 * Note that this implies that the underlying transport
+	 * has some form of congestion control (see RFC 7530 section 3.1
+	 * paragraph 2). For now, we assume that all supported RDMA
+	 * transports are suitable here.
+	 */
+	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
 	if (listener)
 		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
 
@@ -923,14 +929,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_fastreg_mr *frmr = NULL;
 
-	spin_lock_bh(&rdma->sc_frmr_q_lock);
+	spin_lock(&rdma->sc_frmr_q_lock);
 	if (!list_empty(&rdma->sc_frmr_q)) {
 		frmr = list_entry(rdma->sc_frmr_q.next,
 				  struct svc_rdma_fastreg_mr, frmr_list);
 		list_del_init(&frmr->frmr_list);
 		frmr->sg_nents = 0;
 	}
-	spin_unlock_bh(&rdma->sc_frmr_q_lock);
+	spin_unlock(&rdma->sc_frmr_q_lock);
 	if (frmr)
 		return frmr;
 
@@ -943,10 +949,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
 	if (frmr) {
 		ib_dma_unmap_sg(rdma->sc_cm_id->device,
 				frmr->sg, frmr->sg_nents, frmr->direction);
-		spin_lock_bh(&rdma->sc_frmr_q_lock);
+		spin_lock(&rdma->sc_frmr_q_lock);
 		WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
 		list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
-		spin_unlock_bh(&rdma->sc_frmr_q_lock);
+		spin_unlock(&rdma->sc_frmr_q_lock);
 	}
 }
 
@@ -1002,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	newxprt->sc_max_req_size = svcrdma_max_req_size;
 	newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
 					 svcrdma_max_requests);
+	newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
 	newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
 					    svcrdma_max_bc_requests);
 	newxprt->sc_rq_depth = newxprt->sc_max_requests +
@@ -1027,13 +1034,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		goto errout;
 	}
 	newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
-					0, IB_POLL_SOFTIRQ);
+					0, IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_sq_cq)) {
 		dprintk("svcrdma: error creating SQ CQ for connect request\n");
 		goto errout;
 	}
 	newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
-					0, IB_POLL_SOFTIRQ);
+					0, IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_rq_cq)) {
 		dprintk("svcrdma: error creating RQ CQ for connect request\n");
 		goto errout;
@@ -1213,20 +1220,18 @@ static void __svc_rdma_free(struct work_struct *work)
 	 */
 	while (!list_empty(&rdma->sc_read_complete_q)) {
 		struct svc_rdma_op_ctxt *ctxt;
-		ctxt = list_entry(rdma->sc_read_complete_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
+		ctxt = list_first_entry(&rdma->sc_read_complete_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 		svc_rdma_put_context(ctxt, 1);
 	}
 
 	/* Destroy queued, but not processed recv completions */
 	while (!list_empty(&rdma->sc_rq_dto_q)) {
 		struct svc_rdma_op_ctxt *ctxt;
-		ctxt = list_entry(rdma->sc_rq_dto_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
+		ctxt = list_first_entry(&rdma->sc_rq_dto_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 		svc_rdma_put_context(ctxt, 1);
 	}