Explorar o código

Merge branch 'nfs-for-2.6.39' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6

* 'nfs-for-2.6.39' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6: (54 commits)
  RPC: killing RPC tasks races fixed
  xprt: remove redundant check
  SUNRPC: Convert struct rpc_xprt to use atomic_t counters
  SUNRPC: Ensure we always run the tk_callback before tk_action
  sunrpc: fix printk format warning
  xprt: remove redundant null check
  nfs: BKL is no longer needed, so remove the include
  NFS: Fix a warning in fs/nfs/idmap.c
  Cleanup: Factor out some cut-and-paste code.
  cleanup: save 60 lines/100 bytes by combining two mostly duplicate functions.
  NFS: account direct-io into task io accounting
  gss:krb5 only include enctype numbers in gm_upcall_enctypes
  RPCRDMA: Fix FRMR registration/invalidate handling.
  RPCRDMA: Fix to XDR page base interpretation in marshalling logic.
  NFSv4: Send unmapped uid/gids to the server when using auth_sys
  NFSv4: Propagate the error NFS4ERR_BADOWNER to nfs4_do_setattr
  NFSv4: cleanup idmapper functions to take an nfs_server argument
  NFSv4: Send unmapped uid/gids to the server if the idmapper fails
  NFSv4: If the server sends us a numeric uid/gid then accept it
  NFSv4.1: reject zero layout with zeroed stripe unit
  ...
Linus Torvalds hai 14 anos
pai
achega
179198373c

+ 7 - 0
Documentation/filesystems/nfs/pnfs.txt

@@ -46,3 +46,10 @@ data server cache
 file driver devices refer to data servers, which are kept in a module
 file driver devices refer to data servers, which are kept in a module
 level cache.  Its reference is held over the lifetime of the deviceid
 level cache.  Its reference is held over the lifetime of the deviceid
 pointing to it.
 pointing to it.
+
+lseg
+----
+lseg maintains an extra reference corresponding to the NFS_LSEG_VALID
+bit which holds it in the pnfs_layout_hdr's list.  When the final lseg
+is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED
+bit is set, preventing any new lsegs from being added.

+ 8 - 0
Documentation/kernel-parameters.txt

@@ -1580,6 +1580,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			of returning the full 64-bit number.
 			of returning the full 64-bit number.
 			The default is to return 64-bit inode numbers.
 			The default is to return 64-bit inode numbers.
 
 
+	nfs.nfs4_disable_idmapping=
+			[NFSv4] When set, this option disables the NFSv4
+			idmapper on the client, but only if the mount
+			is using the 'sec=sys' security flavour. This may
+			make migration from legacy NFSv2/v3 systems easier
+			provided that the server has the appropriate support.
+			The default is to always enable NFSv4 idmapping.
+
 	nmi_debug=	[KNL,AVR32,SH] Specify one or more actions to take
 	nmi_debug=	[KNL,AVR32,SH] Specify one or more actions to take
 			when a NMI is triggered.
 			when a NMI is triggered.
 			Format: [state][,regs][,debounce][,die]
 			Format: [state][,regs][,debounce][,die]

+ 1 - 1
fs/nfs/callback_proc.c

@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
 			rv = NFS4ERR_DELAY;
 			rv = NFS4ERR_DELAY;
 		list_del_init(&lo->plh_bulk_recall);
 		list_del_init(&lo->plh_bulk_recall);
 		spin_unlock(&ino->i_lock);
 		spin_unlock(&ino->i_lock);
+		pnfs_free_lseg_list(&free_me_list);
 		put_layout_hdr(lo);
 		put_layout_hdr(lo);
 		iput(ino);
 		iput(ino);
 	}
 	}
-	pnfs_free_lseg_list(&free_me_list);
 	return rv;
 	return rv;
 }
 }
 
 

+ 105 - 26
fs/nfs/client.c

@@ -81,6 +81,11 @@ retry:
 }
 }
 #endif /* CONFIG_NFS_V4 */
 #endif /* CONFIG_NFS_V4 */
 
 
+/*
+ * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
+ */
+static int nfs4_disable_idmapping = 0;
+
 /*
 /*
  * RPC cruft for NFS
  * RPC cruft for NFS
  */
  */
@@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
  * Look up a client by IP address and protocol version
  * Look up a client by IP address and protocol version
  * - creates a new record if one doesn't yet exist
  * - creates a new record if one doesn't yet exist
  */
  */
-static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
+static struct nfs_client *
+nfs_get_client(const struct nfs_client_initdata *cl_init,
+	       const struct rpc_timeout *timeparms,
+	       const char *ip_addr,
+	       rpc_authflavor_t authflavour,
+	       int noresvport)
 {
 {
 	struct nfs_client *clp, *new = NULL;
 	struct nfs_client *clp, *new = NULL;
 	int error;
 	int error;
@@ -512,6 +522,13 @@ install_client:
 	clp = new;
 	clp = new;
 	list_add(&clp->cl_share_link, &nfs_client_list);
 	list_add(&clp->cl_share_link, &nfs_client_list);
 	spin_unlock(&nfs_client_lock);
 	spin_unlock(&nfs_client_lock);
+
+	error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
+					      authflavour, noresvport);
+	if (error < 0) {
+		nfs_put_client(clp);
+		return ERR_PTR(error);
+	}
 	dprintk("--> nfs_get_client() = %p [new]\n", clp);
 	dprintk("--> nfs_get_client() = %p [new]\n", clp);
 	return clp;
 	return clp;
 
 
@@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
 /*
 /*
  * Initialise an NFS2 or NFS3 client
  * Initialise an NFS2 or NFS3 client
  */
  */
-static int nfs_init_client(struct nfs_client *clp,
-			   const struct rpc_timeout *timeparms,
-			   const struct nfs_parsed_mount_data *data)
+int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
+		    const char *ip_addr, rpc_authflavor_t authflavour,
+		    int noresvport)
 {
 {
 	int error;
 	int error;
 
 
@@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp,
 	 * - RFC 2623, sec 2.3.2
 	 * - RFC 2623, sec 2.3.2
 	 */
 	 */
 	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
 	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
-				      0, data->flags & NFS_MOUNT_NORESVPORT);
+				      0, noresvport);
 	if (error < 0)
 	if (error < 0)
 		goto error;
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
 	nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server,
 		cl_init.rpc_ops = &nfs_v3_clientops;
 		cl_init.rpc_ops = &nfs_v3_clientops;
 #endif
 #endif
 
 
+	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+			data->timeo, data->retrans);
+
 	/* Allocate or find a client reference we can use */
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init);
+	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
+			     data->flags & NFS_MOUNT_NORESVPORT);
 	if (IS_ERR(clp)) {
 	if (IS_ERR(clp)) {
 		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
 		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
 		return PTR_ERR(clp);
 		return PTR_ERR(clp);
 	}
 	}
 
 
-	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
-			data->timeo, data->retrans);
-	error = nfs_init_client(clp, &timeparms, data);
-	if (error < 0)
-		goto error;
-
 	server->nfs_client = clp;
 	server->nfs_client = clp;
 
 
 	/* Initialise the client representation from the mount data */
 	/* Initialise the client representation from the mount data */
@@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
 	spin_lock(&nfs_client_lock);
 	spin_lock(&nfs_client_lock);
 	list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
 	list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
 	list_add_tail(&server->master_link, &nfs_volume_list);
 	list_add_tail(&server->master_link, &nfs_volume_list);
+	clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
 	spin_unlock(&nfs_client_lock);
 	spin_unlock(&nfs_client_lock);
 
 
 }
 }
 
 
 static void nfs_server_remove_lists(struct nfs_server *server)
 static void nfs_server_remove_lists(struct nfs_server *server)
 {
 {
+	struct nfs_client *clp = server->nfs_client;
+
 	spin_lock(&nfs_client_lock);
 	spin_lock(&nfs_client_lock);
 	list_del_rcu(&server->client_link);
 	list_del_rcu(&server->client_link);
+	if (clp && list_empty(&clp->cl_superblocks))
+		set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
 	list_del(&server->master_link);
 	list_del(&server->master_link);
 	spin_unlock(&nfs_client_lock);
 	spin_unlock(&nfs_client_lock);
 
 
@@ -1307,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 /*
 /*
  * Initialise an NFS4 client record
  * Initialise an NFS4 client record
  */
  */
-static int nfs4_init_client(struct nfs_client *clp,
-		const struct rpc_timeout *timeparms,
-		const char *ip_addr,
-		rpc_authflavor_t authflavour,
-		int flags)
+int nfs4_init_client(struct nfs_client *clp,
+		     const struct rpc_timeout *timeparms,
+		     const char *ip_addr,
+		     rpc_authflavor_t authflavour,
+		     int noresvport)
 {
 {
 	int error;
 	int error;
 
 
@@ -1325,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp,
 	clp->rpc_ops = &nfs_v4_clientops;
 	clp->rpc_ops = &nfs_v4_clientops;
 
 
 	error = nfs_create_rpc_client(clp, timeparms, authflavour,
 	error = nfs_create_rpc_client(clp, timeparms, authflavour,
-				      1, flags & NFS_MOUNT_NORESVPORT);
+				      1, noresvport);
 	if (error < 0)
 	if (error < 0)
 		goto error;
 		goto error;
 	strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
 	strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1378,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server,
 	dprintk("--> nfs4_set_client()\n");
 	dprintk("--> nfs4_set_client()\n");
 
 
 	/* Allocate or find a client reference we can use */
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init);
+	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
+			     server->flags & NFS_MOUNT_NORESVPORT);
 	if (IS_ERR(clp)) {
 	if (IS_ERR(clp)) {
 		error = PTR_ERR(clp);
 		error = PTR_ERR(clp);
 		goto error;
 		goto error;
 	}
 	}
-	error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
-					server->flags);
-	if (error < 0)
-		goto error_put;
+
+	/*
+	 * Query for the lease time on clientid setup or renewal
+	 *
+	 * Note that this will be set on nfs_clients that were created
+	 * only for the DS role and did not set this bit, but now will
+	 * serve a dual role.
+	 */
+	set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
 
 
 	server->nfs_client = clp;
 	server->nfs_client = clp;
 	dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
 	dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
 	return 0;
 	return 0;
-
-error_put:
-	nfs_put_client(clp);
 error:
 error:
 	dprintk("<-- nfs4_set_client() = xerror %d\n", error);
 	dprintk("<-- nfs4_set_client() = xerror %d\n", error);
 	return error;
 	return error;
 }
 }
 
 
+/*
+ * Set up a pNFS Data Server client.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+		const struct sockaddr *ds_addr,
+		int ds_addrlen, int ds_proto)
+{
+	struct nfs_client_initdata cl_init = {
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.rpc_ops = &nfs_v4_clientops,
+		.proto = ds_proto,
+		.minorversion = mds_clp->cl_minorversion,
+	};
+	struct rpc_timeout ds_timeout = {
+		.to_initval = 15 * HZ,
+		.to_maxval = 15 * HZ,
+		.to_retries = 1,
+		.to_exponential = 1,
+	};
+	struct nfs_client *clp;
+
+	/*
+	 * Set an authflavor equal to the MDS value. Use the MDS nfs_client
+	 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
+	 * (section 13.1 RFC 5661).
+	 */
+	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+			     mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
+
+	dprintk("<-- %s %p\n", __func__, clp);
+	return clp;
+}
+EXPORT_SYMBOL(nfs4_set_ds_client);
 
 
 /*
 /*
  * Session has been established, and the client marked ready.
  * Session has been established, and the client marked ready.
@@ -1435,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
 	BUG_ON(!server->nfs_client->rpc_ops);
 	BUG_ON(!server->nfs_client->rpc_ops);
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
 
 
+	/* data servers support only a subset of NFSv4.1 */
+	if (is_ds_only_client(server->nfs_client))
+		return -EPROTONOSUPPORT;
+
 	fattr = nfs_alloc_fattr();
 	fattr = nfs_alloc_fattr();
 	if (fattr == NULL)
 	if (fattr == NULL)
 		return -ENOMEM;
 		return -ENOMEM;
@@ -1504,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server,
 	if (error < 0)
 	if (error < 0)
 		goto error;
 		goto error;
 
 
+	/*
+	 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
+	 * authentication.
+	 */
+	if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
+		server->caps |= NFS_CAP_UIDGID_NOMAP;
+
 	if (data->rsize)
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
 		server->rsize = nfs_block_size(data->rsize, NULL);
 	if (data->wsize)
 	if (data->wsize)
@@ -1921,3 +1996,7 @@ void nfs_fs_proc_exit(void)
 }
 }
 
 
 #endif /* CONFIG_PROC_FS */
 #endif /* CONFIG_PROC_FS */
+
+module_param(nfs4_disable_idmapping, bool, 0644);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+		"Turn off NFSv4 idmapping when using 'sec=sys'");

+ 6 - 2
fs/nfs/direct.c

@@ -45,6 +45,7 @@
 #include <linux/pagemap.h>
 #include <linux/pagemap.h>
 #include <linux/kref.h>
 #include <linux/kref.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
 
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/nfs_page.h>
@@ -649,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 {
 {
 	struct nfs_write_data *data = calldata;
 	struct nfs_write_data *data = calldata;
 
 
-	if (nfs_writeback_done(task, data) != 0)
-		return;
+	nfs_writeback_done(task, data);
 }
 }
 
 
 /*
 /*
@@ -938,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	if (retval)
 	if (retval)
 		goto out;
 		goto out;
 
 
+	task_io_account_read(count);
+
 	retval = nfs_direct_read(iocb, iov, nr_segs, pos);
 	retval = nfs_direct_read(iocb, iov, nr_segs, pos);
 	if (retval > 0)
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 		iocb->ki_pos = pos + retval;
@@ -999,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (retval)
 	if (retval)
 		goto out;
 		goto out;
 
 
+	task_io_account_write(count);
+
 	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
 	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
 
 
 	if (retval > 0)
 	if (retval > 0)

+ 0 - 4
fs/nfs/file.c

@@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 		mapping->host->i_ino, len, (long long) pos);
 
 
-	pnfs_update_layout(mapping->host,
-			   nfs_file_open_context(file),
-			   IOMODE_RW);
-
 start:
 start:
 	/*
 	/*
 	 * Prevent starvation issues if someone is doing a consistency
 	 * Prevent starvation issues if someone is doing a consistency

+ 72 - 18
fs/nfs/idmap.c

@@ -33,16 +33,41 @@
  *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
  */
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+
+static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+{
+	unsigned long val;
+	char buf[16];
+
+	if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
+		return 0;
+	memcpy(buf, name, namelen);
+	buf[namelen] = '\0';
+	if (strict_strtoul(buf, 0, &val) != 0)
+		return 0;
+	*res = val;
+	return 1;
+}
+
+static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
+{
+	return snprintf(buf, buflen, "%u", id);
+}
 
 
 #ifdef CONFIG_NFS_USE_NEW_IDMAPPER
 #ifdef CONFIG_NFS_USE_NEW_IDMAPPER
 
 
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/cred.h>
 #include <linux/cred.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs_sb.h>
 #include <linux/nfs_idmap.h>
 #include <linux/nfs_idmap.h>
 #include <linux/keyctl.h>
 #include <linux/keyctl.h>
 #include <linux/key-type.h>
 #include <linux/key-type.h>
 #include <linux/rcupdate.h>
 #include <linux/rcupdate.h>
-#include <linux/kernel.h>
 #include <linux/err.h>
 #include <linux/err.h>
 
 
 #include <keys/user-type.h>
 #include <keys/user-type.h>
@@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
 	return ret;
 	return ret;
 }
 }
 
 
-int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
 {
 {
+	if (nfs_map_string_to_numeric(name, namelen, uid))
+		return 0;
 	return nfs_idmap_lookup_id(name, namelen, "uid", uid);
 	return nfs_idmap_lookup_id(name, namelen, "uid", uid);
 }
 }
 
 
-int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
 {
 {
+	if (nfs_map_string_to_numeric(name, namelen, gid))
+		return 0;
 	return nfs_idmap_lookup_id(name, namelen, "gid", gid);
 	return nfs_idmap_lookup_id(name, namelen, "gid", gid);
 }
 }
 
 
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
 {
 {
-	return nfs_idmap_lookup_name(uid, "user", buf, buflen);
+	int ret = -EINVAL;
+
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(uid, buf, buflen);
+	return ret;
 }
 }
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
+int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
 {
 {
-	return nfs_idmap_lookup_name(gid, "group", buf, buflen);
+	int ret = -EINVAL;
+
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(gid, buf, buflen);
+	return ret;
 }
 }
 
 
 #else  /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
 #else  /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
@@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
 #include <linux/init.h>
-#include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/socket.h>
 #include <linux/socket.h>
 #include <linux/in.h>
 #include <linux/in.h>
@@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
 	return hash;
 	return hash;
 }
 }
 
 
-int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
 {
 {
-	struct idmap *idmap = clp->cl_idmap;
+	struct idmap *idmap = server->nfs_client->cl_idmap;
 
 
+	if (nfs_map_string_to_numeric(name, namelen, uid))
+		return 0;
 	return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
 	return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
 }
 }
 
 
-int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
 {
 {
-	struct idmap *idmap = clp->cl_idmap;
+	struct idmap *idmap = server->nfs_client->cl_idmap;
 
 
+	if (nfs_map_string_to_numeric(name, namelen, uid))
+		return 0;
 	return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
 	return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
 }
 }
 
 
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
 {
 {
-	struct idmap *idmap = clp->cl_idmap;
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	int ret = -EINVAL;
 
 
-	return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(uid, buf, buflen);
+	return ret;
 }
 }
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
 {
 {
-	struct idmap *idmap = clp->cl_idmap;
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	int ret = -EINVAL;
 
 
-	return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(uid, buf, buflen);
+	return ret;
 }
 }
 
 
 #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */

+ 22 - 0
fs/nfs/internal.h

@@ -148,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fattr *);
 					   struct nfs_fattr *);
 extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
 extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
 extern int nfs4_check_client_ready(struct nfs_client *clp);
 extern int nfs4_check_client_ready(struct nfs_client *clp);
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+					     const struct sockaddr *ds_addr,
+					     int ds_addrlen, int ds_proto);
 #ifdef CONFIG_PROC_FS
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
 extern void nfs_fs_proc_exit(void);
@@ -213,8 +216,14 @@ extern const u32 nfs41_maxwrite_overhead;
 extern struct rpc_procinfo nfs4_procedures[];
 extern struct rpc_procinfo nfs4_procedures[];
 #endif
 #endif
 
 
+extern int nfs4_init_ds_session(struct nfs_client *clp);
+
 /* proc.c */
 /* proc.c */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+extern int nfs_init_client(struct nfs_client *clp,
+			   const struct rpc_timeout *timeparms,
+			   const char *ip_addr, rpc_authflavor_t authflavour,
+			   int noresvport);
 
 
 /* dir.c */
 /* dir.c */
 extern int nfs_access_cache_shrinker(struct shrinker *shrink,
 extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -262,9 +271,15 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 #endif
 #endif
 
 
 /* read.c */
 /* read.c */
+extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+			     const struct rpc_call_ops *call_ops);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 
 
 /* write.c */
 /* write.c */
+extern int nfs_initiate_write(struct nfs_write_data *data,
+			      struct rpc_clnt *clnt,
+			      const struct rpc_call_ops *call_ops,
+			      int how);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 #ifdef CONFIG_MIGRATION
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
 extern int nfs_migrate_page(struct address_space *,
@@ -274,6 +289,13 @@ extern int nfs_migrate_page(struct address_space *,
 #endif
 #endif
 
 
 /* nfs4proc.c */
 /* nfs4proc.c */
+extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
+extern int nfs4_init_client(struct nfs_client *clp,
+			    const struct rpc_timeout *timeparms,
+			    const char *ip_addr,
+			    rpc_authflavor_t authflavour,
+			    int noresvport);
+extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
 extern int _nfs4_call_sync(struct nfs_server *server,
 extern int _nfs4_call_sync(struct nfs_server *server,
 			   struct rpc_message *msg,
 			   struct rpc_message *msg,
 			   struct nfs4_sequence_args *args,
 			   struct nfs4_sequence_args *args,

+ 1 - 0
fs/nfs/nfs3proc.c

@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.lock		= nfs3_proc_lock,
 	.lock		= nfs3_proc_lock,
 	.clear_acl_cache = nfs3_forget_cached_acls,
 	.clear_acl_cache = nfs3_forget_cached_acls,
 	.close_context	= nfs_close_context,
 	.close_context	= nfs_close_context,
+	.init_client	= nfs_init_client,
 };
 };

+ 28 - 0
fs/nfs/nfs4_fs.h

@@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
 extern int nfs4_setup_sequence(const struct nfs_server *server,
 extern int nfs4_setup_sequence(const struct nfs_server *server,
 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
 		int cache_reply, struct rpc_task *task);
 		int cache_reply, struct rpc_task *task);
+extern int nfs41_setup_sequence(struct nfs4_session *session,
+		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+		int cache_reply, struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
 extern int nfs4_proc_create_session(struct nfs_client *);
 extern int nfs4_proc_create_session(struct nfs_client *);
@@ -259,6 +262,19 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
 extern int nfs4_init_session(struct nfs_server *server);
 extern int nfs4_init_session(struct nfs_server *server);
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
 		struct nfs_fsinfo *fsinfo);
 		struct nfs_fsinfo *fsinfo);
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+	return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
+		EXCHGID4_FLAG_USE_PNFS_DS;
+}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+	return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
+}
 #else /* CONFIG_NFS_v4_1 */
 #else /* CONFIG_NFS_v4_1 */
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
 {
 {
@@ -276,6 +292,18 @@ static inline int nfs4_init_session(struct nfs_server *server)
 {
 {
 	return 0;
 	return 0;
 }
 }
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+	return false;
+}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+	return false;
+}
 #endif /* CONFIG_NFS_V4_1 */
 #endif /* CONFIG_NFS_V4_1 */
 
 
 extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];

+ 331 - 30
fs/nfs/nfs4filelayout.c

@@ -40,32 +40,309 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
 MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
 MODULE_DESCRIPTION("The NFSv4 file layout driver");
 MODULE_DESCRIPTION("The NFSv4 file layout driver");
 
 
-static int
-filelayout_set_layoutdriver(struct nfs_server *nfss)
+#define FILELAYOUT_POLL_RETRY_MAX     (15*HZ)
+
+static loff_t
+filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
+			    loff_t offset)
 {
 {
-	int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
-						nfs4_fl_free_deviceid_callback);
-	if (status) {
-		printk(KERN_WARNING "%s: deviceid cache could not be "
-			"initialized\n", __func__);
-		return status;
+	u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
+	u64 tmp;
+
+	offset -= flseg->pattern_offset;
+	tmp = offset;
+	do_div(tmp, stripe_width);
+
+	return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
+}
+
+/* This function is used by the layout driver to calculate the
+ * offset of the file on the dserver based on whether the
+ * layout type is STRIPE_DENSE or STRIPE_SPARSE
+ */
+static loff_t
+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	switch (flseg->stripe_type) {
+	case STRIPE_SPARSE:
+		return offset;
+
+	case STRIPE_DENSE:
+		return filelayout_get_dense_offset(flseg, offset);
 	}
 	}
-	dprintk("%s: deviceid cache has been initialized successfully\n",
-		__func__);
+
+	BUG();
+}
+
+/* For data server errors we don't recover from */
+static void
+filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+	} else {
+		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+	}
+}
+
+static int filelayout_async_handle_error(struct rpc_task *task,
+					 struct nfs4_state *state,
+					 struct nfs_client *clp,
+					 int *reset)
+{
+	if (task->tk_status >= 0)
+		return 0;
+
+	*reset = 0;
+
+	switch (task->tk_status) {
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_DEADSESSION:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_SEQ_FALSE_RETRY:
+	case -NFS4ERR_SEQ_MISORDERED:
+		dprintk("%s ERROR %d, Reset session. Exchangeid "
+			"flags 0x%x\n", __func__, task->tk_status,
+			clp->cl_exchange_flags);
+		nfs4_schedule_session_recovery(clp->cl_session);
+		break;
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_GRACE:
+	case -EKEYEXPIRED:
+		rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+		break;
+	default:
+		dprintk("%s DS error. Retry through MDS %d\n", __func__,
+			task->tk_status);
+		*reset = 1;
+		break;
+	}
+	task->tk_status = 0;
+	return -EAGAIN;
+}
+
+/* NFS_PROTO call done callback routines */
+
+static int filelayout_read_done_cb(struct rpc_task *task,
+				struct nfs_read_data *data)
+{
+	struct nfs_client *clp = data->ds_clp;
+	int reset = 0;
+
+	dprintk("%s DS read\n", __func__);
+
+	if (filelayout_async_handle_error(task, data->args.context->state,
+					  data->ds_clp, &reset) == -EAGAIN) {
+		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
+			__func__, data->ds_clp, data->ds_clp->cl_session);
+		if (reset) {
+			filelayout_set_lo_fail(data->lseg);
+			nfs4_reset_read(task, data);
+			clp = NFS_SERVER(data->inode)->nfs_client;
+		}
+		nfs_restart_rpc(task, clp);
+		return -EAGAIN;
+	}
+
 	return 0;
 	return 0;
 }
 }
 
 
-/* Clear out the layout by destroying its device list */
-static int
-filelayout_clear_layoutdriver(struct nfs_server *nfss)
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
 {
-	dprintk("--> %s\n", __func__);
+	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+	rdata->read_done_cb = filelayout_read_done_cb;
+
+	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
+				&rdata->args.seq_args, &rdata->res.seq_res,
+				0, task))
+		return;
+
+	rpc_call_start(task);
+}
+
+static void filelayout_read_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+	/* Note this may cause RPC to be resent */
+	rdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_read_release(void *data)
+{
+	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+	rdata->mds_ops->rpc_release(data);
+}
+
+static int filelayout_write_done_cb(struct rpc_task *task,
+				struct nfs_write_data *data)
+{
+	int reset = 0;
+
+	if (filelayout_async_handle_error(task, data->args.context->state,
+					  data->ds_clp, &reset) == -EAGAIN) {
+		struct nfs_client *clp;
+
+		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
+			__func__, data->ds_clp, data->ds_clp->cl_session);
+		if (reset) {
+			filelayout_set_lo_fail(data->lseg);
+			nfs4_reset_write(task, data);
+			clp = NFS_SERVER(data->inode)->nfs_client;
+		} else
+			clp = data->ds_clp;
+		nfs_restart_rpc(task, clp);
+		return -EAGAIN;
+	}
 
 
-	if (nfss->nfs_client->cl_devid_cache)
-		pnfs_put_deviceid_cache(nfss->nfs_client);
 	return 0;
 	return 0;
 }
 }
 
 
+static void filelayout_write_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
+				&wdata->args.seq_args, &wdata->res.seq_res,
+				0, task))
+		return;
+
+	rpc_call_start(task);
+}
+
+static void filelayout_write_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+	/* Note this may cause RPC to be resent */
+	wdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_write_release(void *data)
+{
+	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+	wdata->mds_ops->rpc_release(data);
+}
+
+struct rpc_call_ops filelayout_read_call_ops = {
+	.rpc_call_prepare = filelayout_read_prepare,
+	.rpc_call_done = filelayout_read_call_done,
+	.rpc_release = filelayout_read_release,
+};
+
+struct rpc_call_ops filelayout_write_call_ops = {
+	.rpc_call_prepare = filelayout_write_prepare,
+	.rpc_call_done = filelayout_write_call_done,
+	.rpc_release = filelayout_write_release,
+};
+
+static enum pnfs_try_status
+filelayout_read_pagelist(struct nfs_read_data *data)
+{
+	struct pnfs_layout_segment *lseg = data->lseg;
+	struct nfs4_pnfs_ds *ds;
+	loff_t offset = data->args.offset;
+	u32 j, idx;
+	struct nfs_fh *fh;
+	int status;
+
+	dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+		__func__, data->inode->i_ino,
+		data->args.pgbase, (size_t)data->args.count, offset);
+
+	/* Retrieve the correct rpc_client for the byte range */
+	j = nfs4_fl_calc_j_index(lseg, offset);
+	idx = nfs4_fl_calc_ds_index(lseg, j);
+	ds = nfs4_fl_prepare_ds(lseg, idx);
+	if (!ds) {
+		/* Either layout fh index faulty, or ds connect failed */
+		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	dprintk("%s USE DS:ip %x %hu\n", __func__,
+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+	/* No multipath support. Use first DS */
+	data->ds_clp = ds->ds_clp;
+	fh = nfs4_fl_select_ds_fh(lseg, j);
+	if (fh)
+		data->args.fh = fh;
+
+	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+	data->mds_offset = offset;
+
+	/* Perform an asynchronous read to ds */
+	status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
+				   &filelayout_read_call_ops);
+	BUG_ON(status != 0);
+	return PNFS_ATTEMPTED;
+}
+
+/* Perform async writes. */
+static enum pnfs_try_status
+filelayout_write_pagelist(struct nfs_write_data *data, int sync)
+{
+	struct pnfs_layout_segment *lseg = data->lseg;
+	struct nfs4_pnfs_ds *ds;
+	loff_t offset = data->args.offset;
+	u32 j, idx;
+	struct nfs_fh *fh;
+	int status;
+
+	/* Retrieve the correct rpc_client for the byte range */
+	j = nfs4_fl_calc_j_index(lseg, offset);
+	idx = nfs4_fl_calc_ds_index(lseg, j);
+	ds = nfs4_fl_prepare_ds(lseg, idx);
+	if (!ds) {
+		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
+		data->inode->i_ino, sync, (size_t) data->args.count, offset,
+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+	/* We can't handle commit to ds yet */
+	if (!FILELAYOUT_LSEG(lseg)->commit_through_mds)
+		data->args.stable = NFS_FILE_SYNC;
+
+	data->write_done_cb = filelayout_write_done_cb;
+	data->ds_clp = ds->ds_clp;
+	fh = nfs4_fl_select_ds_fh(lseg, j);
+	if (fh)
+		data->args.fh = fh;
+	/*
+	 * Get the file offset on the dserver. Set the write offset to
+	 * this offset and save the original offset.
+	 */
+	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+	data->mds_offset = offset;
+
+	/* Perform an asynchronous write */
+	status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
+				    &filelayout_write_call_ops, sync);
+	BUG_ON(status != 0);
+	return PNFS_ATTEMPTED;
+}
+
 /*
 /*
  * filelayout_check_layout()
  * filelayout_check_layout()
  *
  *
@@ -92,14 +369,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 		goto out;
 		goto out;
 	}
 	}
 
 
-	if (fl->stripe_unit % PAGE_SIZE) {
-		dprintk("%s Stripe unit (%u) not page aligned\n",
+	if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
+		dprintk("%s Invalid stripe unit (%u)\n",
 			__func__, fl->stripe_unit);
 			__func__, fl->stripe_unit);
 		goto out;
 		goto out;
 	}
 	}
 
 
 	/* find and reference the deviceid */
 	/* find and reference the deviceid */
-	dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
+	dsaddr = nfs4_fl_find_get_deviceid(id);
 	if (dsaddr == NULL) {
 	if (dsaddr == NULL) {
 		dsaddr = get_device_info(lo->plh_inode, id);
 		dsaddr = get_device_info(lo->plh_inode, id);
 		if (dsaddr == NULL)
 		if (dsaddr == NULL)
@@ -134,7 +411,7 @@ out:
 	dprintk("--> %s returns %d\n", __func__, status);
 	dprintk("--> %s returns %d\n", __func__, status);
 	return status;
 	return status;
 out_put:
 out_put:
-	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
+	nfs4_fl_put_deviceid(dsaddr);
 	goto out;
 	goto out;
 }
 }
 
 
@@ -243,23 +520,47 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 static void
 static void
 filelayout_free_lseg(struct pnfs_layout_segment *lseg)
 filelayout_free_lseg(struct pnfs_layout_segment *lseg)
 {
 {
-	struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 
 
 	dprintk("--> %s\n", __func__);
 	dprintk("--> %s\n", __func__);
-	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
-			  &fl->dsaddr->deviceid);
+	nfs4_fl_put_deviceid(fl->dsaddr);
 	_filelayout_free_lseg(fl);
 	_filelayout_free_lseg(fl);
 }
 }
 
 
+/*
+ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * return 1 :  coalesce page
+ * return 0 :  don't coalesce page
+ */
+int
+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		   struct nfs_page *req)
+{
+	u64 p_stripe, r_stripe;
+	u32 stripe_unit;
+
+	if (!pgio->pg_lseg)
+		return 1;
+	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
+	r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
+	stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+	do_div(p_stripe, stripe_unit);
+	do_div(r_stripe, stripe_unit);
+
+	return (p_stripe == r_stripe);
+}
+
 static struct pnfs_layoutdriver_type filelayout_type = {
 static struct pnfs_layoutdriver_type filelayout_type = {
-	.id = LAYOUT_NFSV4_1_FILES,
-	.name = "LAYOUT_NFSV4_1_FILES",
-	.owner = THIS_MODULE,
-	.set_layoutdriver = filelayout_set_layoutdriver,
-	.clear_layoutdriver = filelayout_clear_layoutdriver,
-	.alloc_lseg              = filelayout_alloc_lseg,
-	.free_lseg               = filelayout_free_lseg,
+	.id			= LAYOUT_NFSV4_1_FILES,
+	.name			= "LAYOUT_NFSV4_1_FILES",
+	.owner			= THIS_MODULE,
+	.alloc_lseg		= filelayout_alloc_lseg,
+	.free_lseg		= filelayout_free_lseg,
+	.pg_test		= filelayout_pg_test,
+	.read_pagelist		= filelayout_read_pagelist,
+	.write_pagelist		= filelayout_write_pagelist,
 };
 };
 
 
 static int __init nfs4filelayout_init(void)
 static int __init nfs4filelayout_init(void)

+ 16 - 3
fs/nfs/nfs4filelayout.h

@@ -55,8 +55,14 @@ struct nfs4_pnfs_ds {
 	atomic_t		ds_count;
 	atomic_t		ds_count;
 };
 };
 
 
+/* nfs4_file_layout_dsaddr flags */
+#define NFS4_DEVICE_ID_NEG_ENTRY	0x00000001
+
 struct nfs4_file_layout_dsaddr {
 struct nfs4_file_layout_dsaddr {
-	struct pnfs_deviceid_node	deviceid;
+	struct hlist_node		node;
+	struct nfs4_deviceid		deviceid;
+	atomic_t			ref;
+	unsigned long			flags;
 	u32				stripe_count;
 	u32				stripe_count;
 	u8				*stripe_indices;
 	u8				*stripe_indices;
 	u32				ds_num;
 	u32				ds_num;
@@ -83,11 +89,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
 			    generic_hdr);
 			    generic_hdr);
 }
 }
 
 
-extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
+extern struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
+
 extern void print_ds(struct nfs4_pnfs_ds *ds);
 extern void print_ds(struct nfs4_pnfs_ds *ds);
 extern void print_deviceid(struct nfs4_deviceid *dev_id);
 extern void print_deviceid(struct nfs4_deviceid *dev_id);
+u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
+					u32 ds_idx);
 extern struct nfs4_file_layout_dsaddr *
 extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
+extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
 
 

+ 224 - 28
fs/nfs/nfs4filelayoutdev.c

@@ -36,6 +36,30 @@
 
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 
 
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_FL_DEVICE_ID_HASH_BITS	5
+#define NFS4_FL_DEVICE_ID_HASH_SIZE	(1 << NFS4_FL_DEVICE_ID_HASH_BITS)
+#define NFS4_FL_DEVICE_ID_HASH_MASK	(NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
+
+static inline u32
+nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
+{
+	unsigned char *cptr = (unsigned char *)id->data;
+	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+	u32 x = 0;
+
+	while (nbytes--) {
+		x *= 37;
+		x += *cptr++;
+	}
+	return x & NFS4_FL_DEVICE_ID_HASH_MASK;
+}
+
+static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(filelayout_deviceid_lock);
+
 /*
 /*
  * Data server cache
  * Data server cache
  *
  *
@@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
 	return NULL;
 	return NULL;
 }
 }
 
 
+/*
+ * Create an rpc connection to the nfs4_pnfs_ds data server
+ * Currently only supports IPv4
+ */
+static int
+nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
+{
+	struct nfs_client *clp;
+	struct sockaddr_in sin;
+	int status = 0;
+
+	dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+		mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = ds->ds_ip_addr;
+	sin.sin_port = ds->ds_port;
+
+	clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
+				 sizeof(sin), IPPROTO_TCP);
+	if (IS_ERR(clp)) {
+		status = PTR_ERR(clp);
+		goto out;
+	}
+
+	if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
+		if (!is_ds_client(clp)) {
+			status = -ENODEV;
+			goto out_put;
+		}
+		ds->ds_clp = clp;
+		dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
+			ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+		goto out;
+	}
+
+	/*
+	 * Do not set NFS_CS_CHECK_LEASE_TIME; instead, set the DS lease to
+	 * be equal to the MDS lease. Renewal is scheduled in create_session.
+	 */
+	spin_lock(&mds_srv->nfs_client->cl_lock);
+	clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
+	spin_unlock(&mds_srv->nfs_client->cl_lock);
+	clp->cl_last_renewal = jiffies;
+
+	/* New nfs_client */
+	status = nfs4_init_ds_session(clp);
+	if (status)
+		goto out_put;
+
+	ds->ds_clp = clp;
+	dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
+		ntohs(ds->ds_port));
+out:
+	return status;
+out_put:
+	nfs_put_client(clp);
+	goto out;
+}
+
 static void
 static void
 destroy_ds(struct nfs4_pnfs_ds *ds)
 destroy_ds(struct nfs4_pnfs_ds *ds)
 {
 {
@@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 	struct nfs4_pnfs_ds *ds;
 	struct nfs4_pnfs_ds *ds;
 	int i;
 	int i;
 
 
-	print_deviceid(&dsaddr->deviceid.de_id);
+	print_deviceid(&dsaddr->deviceid);
 
 
 	for (i = 0; i < dsaddr->ds_num; i++) {
 	for (i = 0; i < dsaddr->ds_num; i++) {
 		ds = dsaddr->ds_list[i];
 		ds = dsaddr->ds_list[i];
@@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 	kfree(dsaddr);
 	kfree(dsaddr);
 }
 }
 
 
-void
-nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
-{
-	struct nfs4_file_layout_dsaddr *dsaddr =
-		container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
-
-	nfs4_fl_free_deviceid(dsaddr);
-}
-
 static struct nfs4_pnfs_ds *
 static struct nfs4_pnfs_ds *
 nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
 nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
 {
 {
@@ -300,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
 	dsaddr->stripe_count = cnt;
 	dsaddr->stripe_count = cnt;
 	dsaddr->ds_num = num;
 	dsaddr->ds_num = num;
 
 
-	memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+	memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
 
 
 	/* Go back and read stripe indices */
 	/* Go back and read stripe indices */
 	p = indicesp;
 	p = indicesp;
@@ -350,28 +426,37 @@ out_err:
 }
 }
 
 
 /*
 /*
- * Decode the opaque device specified in 'dev'
- * and add it to the list of available devices.
- * If the deviceid is already cached, nfs4_add_deviceid will return
- * a pointer to the cached struct and throw away the new.
+ * Decode the opaque device specified in 'dev' and add it to the cache of
+ * available devices.
  */
  */
-static struct nfs4_file_layout_dsaddr*
+static struct nfs4_file_layout_dsaddr *
 decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
 decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
 {
 {
-	struct nfs4_file_layout_dsaddr *dsaddr;
-	struct pnfs_deviceid_node *d;
+	struct nfs4_file_layout_dsaddr *d, *new;
+	long hash;
 
 
-	dsaddr = decode_device(inode, dev);
-	if (!dsaddr) {
+	new = decode_device(inode, dev);
+	if (!new) {
 		printk(KERN_WARNING "%s: Could not decode or add device\n",
 		printk(KERN_WARNING "%s: Could not decode or add device\n",
 			__func__);
 			__func__);
 		return NULL;
 		return NULL;
 	}
 	}
 
 
-	d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
-			      &dsaddr->deviceid);
+	spin_lock(&filelayout_deviceid_lock);
+	d = nfs4_fl_find_get_deviceid(&new->deviceid);
+	if (d) {
+		spin_unlock(&filelayout_deviceid_lock);
+		nfs4_fl_free_deviceid(new);
+		return d;
+	}
+
+	INIT_HLIST_NODE(&new->node);
+	atomic_set(&new->ref, 1);
+	hash = nfs4_fl_deviceid_hash(&new->deviceid);
+	hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
+	spin_unlock(&filelayout_deviceid_lock);
 
 
-	return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+	return new;
 }
 }
 
 
 /*
 /*
@@ -446,12 +531,123 @@ out_free:
 	return dsaddr;
 	return dsaddr;
 }
 }
 
 
+void
+nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+	if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
+		hlist_del_rcu(&dsaddr->node);
+		spin_unlock(&filelayout_deviceid_lock);
+
+		synchronize_rcu();
+		nfs4_fl_free_deviceid(dsaddr);
+	}
+}
+
 struct nfs4_file_layout_dsaddr *
 struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
+nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
+{
+	struct nfs4_file_layout_dsaddr *d;
+	struct hlist_node *n;
+	long hash = nfs4_fl_deviceid_hash(id);
+
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
+		if (!memcmp(&d->deviceid, id, sizeof(*id))) {
+			if (!atomic_inc_not_zero(&d->ref))
+				goto fail;
+			rcu_read_unlock();
+			return d;
+		}
+	}
+fail:
+	rcu_read_unlock();
+	return NULL;
+}
+
+/*
+ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
+ * Then: ((res + fsi) % dsaddr->stripe_count)
+ */
+u32
+nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+	u64 tmp;
+
+	tmp = offset - flseg->pattern_offset;
+	do_div(tmp, flseg->stripe_unit);
+	tmp += flseg->first_stripe_index;
+	return do_div(tmp, flseg->dsaddr->stripe_count);
+}
+
+u32
+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
+{
+	return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
+}
+
+struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+	u32 i;
+
+	if (flseg->stripe_type == STRIPE_SPARSE) {
+		if (flseg->num_fh == 1)
+			i = 0;
+		else if (flseg->num_fh == 0)
+			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+			return NULL;
+		else
+			i = nfs4_fl_calc_ds_index(lseg, j);
+	} else
+		i = j;
+	return flseg->fh_array[i];
+}
+
+static void
+filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
+			       int err, u32 ds_addr)
+{
+	u32 *p = (u32 *)&dsaddr->deviceid;
+
+	printk(KERN_ERR "NFS: data server %x connection error %d."
+		" Deviceid [%x%x%x%x] marked out of use.\n",
+		ds_addr, err, p[0], p[1], p[2], p[3]);
+
+	spin_lock(&filelayout_deviceid_lock);
+	dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
+	spin_unlock(&filelayout_deviceid_lock);
+}
+
+struct nfs4_pnfs_ds *
+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 {
 {
-	struct pnfs_deviceid_node *d;
+	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
+	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
 
 
-	d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
-	return (d == NULL) ? NULL :
-		container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+	if (ds == NULL) {
+		printk(KERN_ERR "%s: No data server for offset index %d\n",
+			__func__, ds_idx);
+		return NULL;
+	}
+
+	if (!ds->ds_clp) {
+		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
+		int err;
+
+		if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
+			/* Already tried to connect, don't try again */
+			dprintk("%s Deviceid marked out of use\n", __func__);
+			return NULL;
+		}
+		err = nfs4_ds_connect(s, ds);
+		if (err) {
+			filelayout_mark_devid_negative(dsaddr, err,
+						       ntohl(ds->ds_ip_addr));
+			return NULL;
+		}
+	}
+	return ds;
 }
 }

+ 108 - 15
fs/nfs/nfs4proc.c

@@ -85,6 +85,9 @@ static int nfs4_map_errors(int err)
 	switch (err) {
 	switch (err) {
 	case -NFS4ERR_RESOURCE:
 	case -NFS4ERR_RESOURCE:
 		return -EREMOTEIO;
 		return -EREMOTEIO;
+	case -NFS4ERR_BADOWNER:
+	case -NFS4ERR_BADNAME:
+		return -EINVAL;
 	default:
 	default:
 		dprintk("%s could not handle NFSv4 error %d\n",
 		dprintk("%s could not handle NFSv4 error %d\n",
 				__func__, -err);
 				__func__, -err);
@@ -241,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 /* This is the error handling routine for processes that are allowed
 /* This is the error handling routine for processes that are allowed
  * to sleep.
  * to sleep.
  */
  */
-static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
 {
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state *state = exception->state;
 	struct nfs4_state *state = exception->state;
@@ -293,6 +296,19 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 				break;
 				break;
 		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_OLD_STATEID:
 			exception->retry = 1;
 			exception->retry = 1;
+			break;
+		case -NFS4ERR_BADOWNER:
+			/* The following works around a Linux server bug! */
+		case -NFS4ERR_BADNAME:
+			if (server->caps & NFS_CAP_UIDGID_NOMAP) {
+				server->caps &= ~NFS_CAP_UIDGID_NOMAP;
+				exception->retry = 1;
+				printk(KERN_WARNING "NFS: v4 server %s "
+						"does not accept raw "
+						"uid/gids. "
+						"Reenabling the idmapper.\n",
+						server->nfs_client->cl_hostname);
+			}
 	}
 	}
 	/* We failed to handle the error */
 	/* We failed to handle the error */
 	return nfs4_map_errors(ret);
 	return nfs4_map_errors(ret);
@@ -505,7 +521,7 @@ out:
 	return ret_id;
 	return ret_id;
 }
 }
 
 
-static int nfs41_setup_sequence(struct nfs4_session *session,
+int nfs41_setup_sequence(struct nfs4_session *session,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_res *res,
 				struct nfs4_sequence_res *res,
 				int cache_reply,
 				int cache_reply,
@@ -571,6 +587,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	res->sr_status = 1;
 	res->sr_status = 1;
 	return 0;
 	return 0;
 }
 }
+EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
 
 
 int nfs4_setup_sequence(const struct nfs_server *server,
 int nfs4_setup_sequence(const struct nfs_server *server,
 			struct nfs4_sequence_args *args,
 			struct nfs4_sequence_args *args,
@@ -1573,9 +1590,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	return 0;
 	return 0;
 }
 }
 
 
-static int nfs4_recover_expired_lease(struct nfs_server *server)
+static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
 {
 {
-	struct nfs_client *clp = server->nfs_client;
 	unsigned int loop;
 	unsigned int loop;
 	int ret;
 	int ret;
 
 
@@ -1592,6 +1608,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
 	return ret;
 	return ret;
 }
 }
 
 
+static int nfs4_recover_expired_lease(struct nfs_server *server)
+{
+	return nfs4_client_recover_expired_lease(server->nfs_client);
+}
+
 /*
 /*
  * OPEN_EXPIRED:
  * OPEN_EXPIRED:
  * 	reclaim state on the server after a network partition.
  * 	reclaim state on the server after a network partition.
@@ -3069,15 +3090,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return err;
 	return err;
 }
 }
 
 
-static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 {
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
 
-	dprintk("--> %s\n", __func__);
-
-	if (!nfs4_sequence_done(task, &data->res.seq_res))
-		return -EAGAIN;
-
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, server->nfs_client);
 		nfs_restart_rpc(task, server->nfs_client);
 		return -EAGAIN;
 		return -EAGAIN;
@@ -3089,19 +3105,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 	return 0;
 	return 0;
 }
 }
 
 
+static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+{
+
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
+
+	return data->read_done_cb(task, data);
+}
+
 static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
 {
 	data->timestamp   = jiffies;
 	data->timestamp   = jiffies;
+	data->read_done_cb = nfs4_read_done_cb;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 }
 }
 
 
-static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+/* Reset the nfs_read_data to send the read to the MDS. */
+void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
+{
+	dprintk("%s Reset task for i/o through\n", __func__);
+	put_lseg(data->lseg);
+	data->lseg = NULL;
+	/* offsets will differ in the dense stripe case */
+	data->args.offset = data->mds_offset;
+	data->ds_clp = NULL;
+	data->args.fh     = NFS_FH(data->inode);
+	data->read_done_cb = nfs4_read_done_cb;
+	task->tk_ops = data->mds_ops;
+	rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+}
+EXPORT_SYMBOL_GPL(nfs4_reset_read);
+
+static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
 {
 {
 	struct inode *inode = data->inode;
 	struct inode *inode = data->inode;
 	
 	
-	if (!nfs4_sequence_done(task, &data->res.seq_res))
-		return -EAGAIN;
-
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
 		return -EAGAIN;
@@ -3113,11 +3154,41 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 	return 0;
 	return 0;
 }
 }
 
 
+static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+{
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
+	return data->write_done_cb(task, data);
+}
+
+/* Reset the nfs_write_data to send the write to the MDS. */
+void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
+{
+	dprintk("%s Reset task for i/o through\n", __func__);
+	put_lseg(data->lseg);
+	data->lseg          = NULL;
+	data->ds_clp        = NULL;
+	data->write_done_cb = nfs4_write_done_cb;
+	data->args.fh       = NFS_FH(data->inode);
+	data->args.bitmask  = data->res.server->cache_consistency_bitmask;
+	data->args.offset   = data->mds_offset;
+	data->res.fattr     = &data->fattr;
+	task->tk_ops        = data->mds_ops;
+	rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+}
+EXPORT_SYMBOL_GPL(nfs4_reset_write);
+
 static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
 
-	data->args.bitmask = server->cache_consistency_bitmask;
+	if (data->lseg) {
+		data->args.bitmask = NULL;
+		data->res.fattr = NULL;
+	} else
+		data->args.bitmask = server->cache_consistency_bitmask;
+	if (!data->write_done_cb)
+		data->write_done_cb = nfs4_write_done_cb;
 	data->res.server = server;
 	data->res.server = server;
 	data->timestamp   = jiffies;
 	data->timestamp   = jiffies;
 
 
@@ -5118,6 +5189,27 @@ int nfs4_init_session(struct nfs_server *server)
 	return ret;
 	return ret;
 }
 }
 
 
+int nfs4_init_ds_session(struct nfs_client *clp)
+{
+	struct nfs4_session *session = clp->cl_session;
+	int ret;
+
+	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
+		return 0;
+
+	ret = nfs4_client_recover_expired_lease(clp);
+	if (!ret)
+		/* Test for the DS role */
+		if (!is_ds_client(clp))
+			ret = -ENODEV;
+	if (!ret)
+		ret = nfs4_check_client_ready(clp);
+	return ret;
+
+}
+EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
+
+
 /*
 /*
  * Renew the cl_session lease.
  * Renew the cl_session lease.
  */
  */
@@ -5648,6 +5740,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.clear_acl_cache = nfs4_zap_acl_attr,
 	.clear_acl_cache = nfs4_zap_acl_attr,
 	.close_context  = nfs4_close_context,
 	.close_context  = nfs4_close_context,
 	.open_context	= nfs4_atomic_open,
 	.open_context	= nfs4_atomic_open,
+	.init_client	= nfs4_init_client,
 };
 };
 
 
 static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
 static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {

+ 1 - 5
fs/nfs/nfs4renewd.c

@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
 	ops = clp->cl_mvops->state_renewal_ops;
 	ops = clp->cl_mvops->state_renewal_ops;
 	dprintk("%s: start\n", __func__);
 	dprintk("%s: start\n", __func__);
 
 
-	rcu_read_lock();
-	if (list_empty(&clp->cl_superblocks)) {
-		rcu_read_unlock();
+	if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
 		goto out;
 		goto out;
-	}
-	rcu_read_unlock();
 
 
 	spin_lock(&clp->cl_lock);
 	spin_lock(&clp->cl_lock);
 	lease = clp->cl_lease_time;
 	lease = clp->cl_lease_time;

+ 6 - 0
fs/nfs/nfs4state.c

@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
 	int status;
 	int status;
 	struct nfs_fsinfo fsinfo;
 	struct nfs_fsinfo fsinfo;
 
 
+	if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+		nfs4_schedule_state_renewal(clp);
+		return 0;
+	}
+
 	status = nfs4_proc_get_lease_time(clp, &fsinfo);
 	status = nfs4_proc_get_lease_time(clp, &fsinfo);
 	if (status == 0) {
 	if (status == 0) {
 		/* Update lease time and schedule renewal */
 		/* Update lease time and schedule renewal */
@@ -1448,6 +1453,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session)
 {
 {
 	nfs4_schedule_lease_recovery(session->clp);
 	nfs4_schedule_lease_recovery(session->clp);
 }
 }
+EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
 
 
 void nfs41_handle_recall_slot(struct nfs_client *clp)
 void nfs41_handle_recall_slot(struct nfs_client *clp)
 {
 {

+ 20 - 18
fs/nfs/nfs4xdr.c

@@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_MODE)
 	if (iap->ia_valid & ATTR_MODE)
 		len += 4;
 		len += 4;
 	if (iap->ia_valid & ATTR_UID) {
 	if (iap->ia_valid & ATTR_UID) {
-		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
+		owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
 		if (owner_namelen < 0) {
 		if (owner_namelen < 0) {
 			dprintk("nfs: couldn't resolve uid %d to string\n",
 			dprintk("nfs: couldn't resolve uid %d to string\n",
 					iap->ia_uid);
 					iap->ia_uid);
@@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
 		len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
 	}
 	}
 	if (iap->ia_valid & ATTR_GID) {
 	if (iap->ia_valid & ATTR_GID) {
-		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
+		owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
 		if (owner_grouplen < 0) {
 		if (owner_grouplen < 0) {
 			dprintk("nfs: couldn't resolve gid %d to string\n",
 			dprintk("nfs: couldn't resolve gid %d to string\n",
 					iap->ia_gid);
 					iap->ia_gid);
@@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	hdr->replen += decode_putrootfh_maxsz;
 	hdr->replen += decode_putrootfh_maxsz;
 }
 }
 
 
-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
+static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
 {
 {
 	nfs4_stateid stateid;
 	nfs4_stateid stateid;
 	__be32 *p;
 	__be32 *p;
@@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
 		nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
+		if (zero_seqid)
+			stateid.stateid.seqid = 0;
 		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
 	} else
 		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	p = reserve_space(xdr, 4);
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_READ);
 	*p = cpu_to_be32(OP_READ);
 
 
-	encode_stateid(xdr, args->context, args->lock_context);
+	encode_stateid(xdr, args->context, args->lock_context,
+		       hdr->minorversion);
 
 
 	p = reserve_space(xdr, 12);
 	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
 	p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	p = reserve_space(xdr, 4);
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_WRITE);
 	*p = cpu_to_be32(OP_WRITE);
 
 
-	encode_stateid(xdr, args->context, args->lock_context);
+	encode_stateid(xdr, args->context, args->lock_context,
+		       hdr->minorversion);
 
 
 	p = reserve_space(xdr, 16);
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
 	p = xdr_encode_hyper(p, args->offset);
@@ -2271,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_write(xdr, args, &hdr);
 	encode_write(xdr, args, &hdr);
 	req->rq_snd_buf.flags |= XDRBUF_WRITE;
 	req->rq_snd_buf.flags |= XDRBUF_WRITE;
-	encode_getfattr(xdr, args->bitmask, &hdr);
+	if (args->bitmask)
+		encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 	encode_nops(&hdr);
 }
 }
 
 
@@ -3382,7 +3387,7 @@ out_overflow:
 }
 }
 
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
-		struct nfs_client *clp, uint32_t *uid, int may_sleep)
+		const struct nfs_server *server, uint32_t *uid, int may_sleep)
 {
 {
 	uint32_t len;
 	uint32_t len;
 	__be32 *p;
 	__be32 *p;
@@ -3402,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
 		if (!may_sleep) {
 		if (!may_sleep) {
 			/* do nothing */
 			/* do nothing */
 		} else if (len < XDR_MAX_NETOBJ) {
 		} else if (len < XDR_MAX_NETOBJ) {
-			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
+			if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
 				ret = NFS_ATTR_FATTR_OWNER;
 				ret = NFS_ATTR_FATTR_OWNER;
 			else
 			else
 				dprintk("%s: nfs_map_name_to_uid failed!\n",
 				dprintk("%s: nfs_map_name_to_uid failed!\n",
@@ -3420,7 +3425,7 @@ out_overflow:
 }
 }
 
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
-		struct nfs_client *clp, uint32_t *gid, int may_sleep)
+		const struct nfs_server *server, uint32_t *gid, int may_sleep)
 {
 {
 	uint32_t len;
 	uint32_t len;
 	__be32 *p;
 	__be32 *p;
@@ -3440,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
 		if (!may_sleep) {
 		if (!may_sleep) {
 			/* do nothing */
 			/* do nothing */
 		} else if (len < XDR_MAX_NETOBJ) {
 		} else if (len < XDR_MAX_NETOBJ) {
-			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
+			if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
 				ret = NFS_ATTR_FATTR_GROUP;
 				ret = NFS_ATTR_FATTR_GROUP;
 			else
 			else
 				dprintk("%s: nfs_map_group_to_gid failed!\n",
 				dprintk("%s: nfs_map_group_to_gid failed!\n",
@@ -3939,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		goto xdr_error;
 		goto xdr_error;
 	fattr->valid |= status;
 	fattr->valid |= status;
 
 
-	status = decode_attr_owner(xdr, bitmap, server->nfs_client,
-			&fattr->uid, may_sleep);
+	status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
 	if (status < 0)
 	if (status < 0)
 		goto xdr_error;
 		goto xdr_error;
 	fattr->valid |= status;
 	fattr->valid |= status;
 
 
-	status = decode_attr_group(xdr, bitmap, server->nfs_client,
-			&fattr->gid, may_sleep);
+	status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
 	if (status < 0)
 	if (status < 0)
 		goto xdr_error;
 		goto xdr_error;
 	fattr->valid |= status;
 	fattr->valid |= status;
@@ -5690,8 +5693,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_write(xdr, res);
 	status = decode_write(xdr, res);
 	if (status)
 	if (status)
 		goto out;
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	if (res->fattr)
+		decode_getfattr(xdr, res->fattr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task));
 	if (!status)
 	if (!status)
 		status = res->count;
 		status = res->count;
 out:
 out:
@@ -6167,8 +6171,6 @@ static struct {
 	{ NFS4ERR_DQUOT,	-EDQUOT		},
 	{ NFS4ERR_DQUOT,	-EDQUOT		},
 	{ NFS4ERR_STALE,	-ESTALE		},
 	{ NFS4ERR_STALE,	-ESTALE		},
 	{ NFS4ERR_BADHANDLE,	-EBADHANDLE	},
 	{ NFS4ERR_BADHANDLE,	-EBADHANDLE	},
-	{ NFS4ERR_BADOWNER,	-EINVAL		},
-	{ NFS4ERR_BADNAME,	-EINVAL		},
 	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},
 	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},
 	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},
 	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},
 	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	},
 	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	},

+ 13 - 9
fs/nfs/pagelist.c

@@ -20,6 +20,7 @@
 #include <linux/nfs_mount.h>
 #include <linux/nfs_mount.h>
 
 
 #include "internal.h"
 #include "internal.h"
+#include "pnfs.h"
 
 
 static struct kmem_cache *nfs_page_cachep;
 static struct kmem_cache *nfs_page_cachep;
 
 
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
  */
  */
 void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 		     struct inode *inode,
 		     struct inode *inode,
-		     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+		     int (*doio)(struct nfs_pageio_descriptor *),
 		     size_t bsize,
 		     size_t bsize,
 		     int io_flags)
 		     int io_flags)
 {
 {
@@ -226,6 +227,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_doio = doio;
 	desc->pg_doio = doio;
 	desc->pg_ioflags = io_flags;
 	desc->pg_ioflags = io_flags;
 	desc->pg_error = 0;
 	desc->pg_error = 0;
+	desc->pg_lseg = NULL;
 }
 }
 
 
 /**
 /**
@@ -240,7 +242,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
  * Return 'true' if this is the case, else return 'false'.
  * Return 'true' if this is the case, else return 'false'.
  */
  */
 static int nfs_can_coalesce_requests(struct nfs_page *prev,
 static int nfs_can_coalesce_requests(struct nfs_page *prev,
-				     struct nfs_page *req)
+				     struct nfs_page *req,
+				     struct nfs_pageio_descriptor *pgio)
 {
 {
 	if (req->wb_context->cred != prev->wb_context->cred)
 	if (req->wb_context->cred != prev->wb_context->cred)
 		return 0;
 		return 0;
@@ -254,6 +257,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
 		return 0;
 		return 0;
 	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
 	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
 		return 0;
 		return 0;
+	/*
+	 * Non-whole file layouts need to check that req is inside of
+	 * pgio->pg_lseg.
+	 */
+	if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
+		return 0;
 	return 1;
 	return 1;
 }
 }
 
 
@@ -286,7 +295,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
 		if (newlen > desc->pg_bsize)
 		if (newlen > desc->pg_bsize)
 			return 0;
 			return 0;
 		prev = nfs_list_entry(desc->pg_list.prev);
 		prev = nfs_list_entry(desc->pg_list.prev);
-		if (!nfs_can_coalesce_requests(prev, req))
+		if (!nfs_can_coalesce_requests(prev, req, desc))
 			return 0;
 			return 0;
 	} else
 	} else
 		desc->pg_base = req->wb_pgbase;
 		desc->pg_base = req->wb_pgbase;
@@ -302,12 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
 static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
 static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
 {
 {
 	if (!list_empty(&desc->pg_list)) {
 	if (!list_empty(&desc->pg_list)) {
-		int error = desc->pg_doio(desc->pg_inode,
-					  &desc->pg_list,
-					  nfs_page_array_len(desc->pg_base,
-							     desc->pg_count),
-					  desc->pg_count,
-					  desc->pg_ioflags);
+		int error = desc->pg_doio(desc);
 		if (error < 0)
 		if (error < 0)
 			desc->pg_error = error;
 			desc->pg_error = error;
 		else
 		else

+ 156 - 174
fs/nfs/pnfs.c

@@ -30,6 +30,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_fs.h>
 #include "internal.h"
 #include "internal.h"
 #include "pnfs.h"
 #include "pnfs.h"
+#include "iostat.h"
 
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
 #define NFSDBG_FACILITY		NFSDBG_PNFS
 
 
@@ -74,10 +75,8 @@ find_pnfs_driver(u32 id)
 void
 void
 unset_pnfs_layoutdriver(struct nfs_server *nfss)
 unset_pnfs_layoutdriver(struct nfs_server *nfss)
 {
 {
-	if (nfss->pnfs_curr_ld) {
-		nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+	if (nfss->pnfs_curr_ld)
 		module_put(nfss->pnfs_curr_ld->owner);
 		module_put(nfss->pnfs_curr_ld->owner);
-	}
 	nfss->pnfs_curr_ld = NULL;
 	nfss->pnfs_curr_ld = NULL;
 }
 }
 
 
@@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
 		goto out_no_driver;
 		goto out_no_driver;
 	}
 	}
 	server->pnfs_curr_ld = ld_type;
 	server->pnfs_curr_ld = ld_type;
-	if (ld_type->set_layoutdriver(server)) {
-		printk(KERN_ERR
-		       "%s: Error initializing mount point for layout driver %u.\n",
-		       __func__, id);
-		module_put(ld_type->owner);
-		goto out_no_driver;
-	}
+
 	dprintk("%s: pNFS module for %u set\n", __func__, id);
 	dprintk("%s: pNFS module for %u set\n", __func__, id);
 	return;
 	return;
 
 
@@ -230,37 +223,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
 	put_layout_hdr(NFS_I(ino)->layout);
 	put_layout_hdr(NFS_I(ino)->layout);
 }
 }
 
 
-/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
- * could sleep, so must be called outside of the lock.
- * Returns 1 if object was removed, otherwise return 0.
- */
-static int
-put_lseg_locked(struct pnfs_layout_segment *lseg,
-		struct list_head *tmp_list)
+static void
+put_lseg_common(struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = lseg->pls_layout->plh_inode;
+
+	BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+	list_del_init(&lseg->pls_list);
+	if (list_empty(&lseg->pls_layout->plh_segs)) {
+		set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
+		/* Matched by initial refcount set in alloc_init_layout_hdr */
+		put_layout_hdr_locked(lseg->pls_layout);
+	}
+	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
+}
+
+void
+put_lseg(struct pnfs_layout_segment *lseg)
 {
 {
+	struct inode *inode;
+
+	if (!lseg)
+		return;
+
 	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
 	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
 		atomic_read(&lseg->pls_refcount),
 		atomic_read(&lseg->pls_refcount),
 		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
 		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
-	if (atomic_dec_and_test(&lseg->pls_refcount)) {
-		struct inode *ino = lseg->pls_layout->plh_inode;
+	inode = lseg->pls_layout->plh_inode;
+	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+		LIST_HEAD(free_me);
 
 
-		BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
-		list_del(&lseg->pls_list);
-		if (list_empty(&lseg->pls_layout->plh_segs)) {
-			struct nfs_client *clp;
-
-			clp = NFS_SERVER(ino)->nfs_client;
-			spin_lock(&clp->cl_lock);
-			/* List does not take a reference, so no need for put here */
-			list_del_init(&lseg->pls_layout->plh_layouts);
-			spin_unlock(&clp->cl_lock);
-			clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
-		}
-		rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
-		list_add(&lseg->pls_list, tmp_list);
-		return 1;
+		put_lseg_common(lseg);
+		list_add(&lseg->pls_list, &free_me);
+		spin_unlock(&inode->i_lock);
+		pnfs_free_lseg_list(&free_me);
 	}
 	}
-	return 0;
 }
 }
 
 
 static bool
 static bool
@@ -281,7 +278,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 		 * list.  It will now be removed when all
 		 * list.  It will now be removed when all
 		 * outstanding io is finished.
 		 * outstanding io is finished.
 		 */
 		 */
-		rv = put_lseg_locked(lseg, tmp_list);
+		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+			atomic_read(&lseg->pls_refcount));
+		if (atomic_dec_and_test(&lseg->pls_refcount)) {
+			put_lseg_common(lseg);
+			list_add(&lseg->pls_list, tmp_list);
+			rv = 1;
+		}
 	}
 	}
 	return rv;
 	return rv;
 }
 }
@@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 
 
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 
 
+	if (list_empty(&lo->plh_segs)) {
+		if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
+			put_layout_hdr_locked(lo);
+		return 0;
+	}
 	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
 	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
 		if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
 		if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
 			dprintk("%s: freeing lseg %p iomode %d "
 			dprintk("%s: freeing lseg %p iomode %d "
@@ -312,11 +320,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 	return invalid - removed;
 	return invalid - removed;
 }
 }
 
 
+/* note free_me must contain lsegs from a single layout_hdr */
 void
 void
 pnfs_free_lseg_list(struct list_head *free_me)
 pnfs_free_lseg_list(struct list_head *free_me)
 {
 {
 	struct pnfs_layout_segment *lseg, *tmp;
 	struct pnfs_layout_segment *lseg, *tmp;
+	struct pnfs_layout_hdr *lo;
+
+	if (list_empty(free_me))
+		return;
 
 
+	lo = list_first_entry(free_me, struct pnfs_layout_segment,
+			      pls_list)->pls_layout;
+
+	if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
+		struct nfs_client *clp;
+
+		clp = NFS_SERVER(lo->plh_inode)->nfs_client;
+		spin_lock(&clp->cl_lock);
+		list_del_init(&lo->plh_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
 	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
 	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
 		list_del(&lseg->pls_list);
 		list_del(&lseg->pls_list);
 		free_lseg(lseg);
 		free_lseg(lseg);
@@ -332,10 +356,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 	spin_lock(&nfsi->vfs_inode.i_lock);
 	spin_lock(&nfsi->vfs_inode.i_lock);
 	lo = nfsi->layout;
 	lo = nfsi->layout;
 	if (lo) {
 	if (lo) {
-		set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
+		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
 		mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
 		mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
-		/* Matched by refcount set to 1 in alloc_init_layout_hdr */
-		put_layout_hdr_locked(lo);
 	}
 	}
 	spin_unlock(&nfsi->vfs_inode.i_lock);
 	spin_unlock(&nfsi->vfs_inode.i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 	pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +425,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
 	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
 	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
 		return true;
 		return true;
 	return lo->plh_block_lgets ||
 	return lo->plh_block_lgets ||
+		test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
 		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 		(list_empty(&lo->plh_segs) &&
 		(list_empty(&lo->plh_segs) &&
 		 (atomic_read(&lo->plh_outstanding) > lget));
 		 (atomic_read(&lo->plh_outstanding) > lget));
@@ -674,7 +697,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
 	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
 	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
 		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
 		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
 		    is_matching_lseg(lseg, iomode)) {
 		    is_matching_lseg(lseg, iomode)) {
-			ret = lseg;
+			ret = get_lseg(lseg);
 			break;
 			break;
 		}
 		}
 		if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
 		if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -699,6 +722,7 @@ pnfs_update_layout(struct inode *ino,
 	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	struct pnfs_layout_hdr *lo;
 	struct pnfs_layout_hdr *lo;
 	struct pnfs_layout_segment *lseg = NULL;
 	struct pnfs_layout_segment *lseg = NULL;
+	bool first = false;
 
 
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
 		return NULL;
 		return NULL;
@@ -715,21 +739,25 @@ pnfs_update_layout(struct inode *ino,
 		dprintk("%s matches recall, use MDS\n", __func__);
 		dprintk("%s matches recall, use MDS\n", __func__);
 		goto out_unlock;
 		goto out_unlock;
 	}
 	}
-	/* Check to see if the layout for the given range already exists */
-	lseg = pnfs_find_lseg(lo, iomode);
-	if (lseg)
-		goto out_unlock;
 
 
 	/* if LAYOUTGET already failed once we don't try again */
 	/* if LAYOUTGET already failed once we don't try again */
 	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
 	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
 		goto out_unlock;
 		goto out_unlock;
 
 
+	/* Check to see if the layout for the given range already exists */
+	lseg = pnfs_find_lseg(lo, iomode);
+	if (lseg)
+		goto out_unlock;
+
 	if (pnfs_layoutgets_blocked(lo, NULL, 0))
 	if (pnfs_layoutgets_blocked(lo, NULL, 0))
 		goto out_unlock;
 		goto out_unlock;
 	atomic_inc(&lo->plh_outstanding);
 	atomic_inc(&lo->plh_outstanding);
 
 
 	get_layout_hdr(lo);
 	get_layout_hdr(lo);
-	if (list_empty(&lo->plh_segs)) {
+	if (list_empty(&lo->plh_segs))
+		first = true;
+	spin_unlock(&ino->i_lock);
+	if (first) {
 		/* The lo must be on the clp list if there is any
 		/* The lo must be on the clp list if there is any
 		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
 		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
 		 */
 		 */
@@ -738,24 +766,18 @@ pnfs_update_layout(struct inode *ino,
 		list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
 		list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
 		spin_unlock(&clp->cl_lock);
 		spin_unlock(&clp->cl_lock);
 	}
 	}
-	spin_unlock(&ino->i_lock);
 
 
 	lseg = send_layoutget(lo, ctx, iomode);
 	lseg = send_layoutget(lo, ctx, iomode);
-	if (!lseg) {
-		spin_lock(&ino->i_lock);
-		if (list_empty(&lo->plh_segs)) {
-			spin_lock(&clp->cl_lock);
-			list_del_init(&lo->plh_layouts);
-			spin_unlock(&clp->cl_lock);
-			clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
-		}
-		spin_unlock(&ino->i_lock);
+	if (!lseg && first) {
+		spin_lock(&clp->cl_lock);
+		list_del_init(&lo->plh_layouts);
+		spin_unlock(&clp->cl_lock);
 	}
 	}
 	atomic_dec(&lo->plh_outstanding);
 	atomic_dec(&lo->plh_outstanding);
 	put_layout_hdr(lo);
 	put_layout_hdr(lo);
 out:
 out:
 	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
 	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-		nfsi->layout->plh_flags, lseg);
+		nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
 	return lseg;
 	return lseg;
 out_unlock:
 out_unlock:
 	spin_unlock(&ino->i_lock);
 	spin_unlock(&ino->i_lock);
@@ -808,7 +830,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	}
 	}
 	init_lseg(lo, lseg);
 	init_lseg(lo, lseg);
 	lseg->pls_range = res->range;
 	lseg->pls_range = res->range;
-	*lgp->lsegpp = lseg;
+	*lgp->lsegpp = get_lseg(lseg);
 	pnfs_insert_layout(lo, lseg);
 	pnfs_insert_layout(lo, lseg);
 
 
 	if (res->return_on_close) {
 	if (res->return_on_close) {
@@ -829,137 +851,97 @@ out_forget_reply:
 	goto out;
 	goto out;
 }
 }
 
 
-/*
- * Device ID cache. Currently supports one layout type per struct nfs_client.
- * Add layout type to the lookup key to expand to support multiple types.
- */
-int
-pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
-			 void (*free_callback)(struct pnfs_deviceid_node *))
+static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
+			     struct nfs_page *prev,
+			     struct nfs_page *req)
 {
 {
-	struct pnfs_deviceid_cache *c;
-
-	c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
-	if (!c)
-		return -ENOMEM;
-	spin_lock(&clp->cl_lock);
-	if (clp->cl_devid_cache != NULL) {
-		atomic_inc(&clp->cl_devid_cache->dc_ref);
-		dprintk("%s [kref [%d]]\n", __func__,
-			atomic_read(&clp->cl_devid_cache->dc_ref));
-		kfree(c);
-	} else {
-		/* kzalloc initializes hlists */
-		spin_lock_init(&c->dc_lock);
-		atomic_set(&c->dc_ref, 1);
-		c->dc_free_callback = free_callback;
-		clp->cl_devid_cache = c;
-		dprintk("%s [new]\n", __func__);
+	if (pgio->pg_count == prev->wb_bytes) {
+		/* This is the first coalesce call for a series of nfs_pages */
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   prev->wb_context,
+						   IOMODE_READ);
 	}
 	}
-	spin_unlock(&clp->cl_lock);
-	return 0;
+	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
 }
 }
-EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
 
 
-/*
- * Called from pnfs_layoutdriver_type->free_lseg
- * last layout segment reference frees deviceid
- */
 void
 void
-pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
-		  struct pnfs_deviceid_node *devid)
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
 {
 {
-	struct nfs4_deviceid *id = &devid->de_id;
-	struct pnfs_deviceid_node *d;
-	struct hlist_node *n;
-	long h = nfs4_deviceid_hash(id);
+	struct pnfs_layoutdriver_type *ld;
 
 
-	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
-	if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
-		return;
+	ld = NFS_SERVER(inode)->pnfs_curr_ld;
+	pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
+}
 
 
-	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
-		if (!memcmp(&d->de_id, id, sizeof(*id))) {
-			hlist_del_rcu(&d->de_node);
-			spin_unlock(&c->dc_lock);
-			synchronize_rcu();
-			c->dc_free_callback(devid);
-			return;
-		}
-	spin_unlock(&c->dc_lock);
-	/* Why wasn't it found in  the list? */
-	BUG();
-}
-EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
-
-/* Find and reference a deviceid */
-struct pnfs_deviceid_node *
-pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
-{
-	struct pnfs_deviceid_node *d;
-	struct hlist_node *n;
-	long hash = nfs4_deviceid_hash(id);
-
-	dprintk("--> %s hash %ld\n", __func__, hash);
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
-		if (!memcmp(&d->de_id, id, sizeof(*id))) {
-			if (!atomic_inc_not_zero(&d->de_ref)) {
-				goto fail;
-			} else {
-				rcu_read_unlock();
-				return d;
-			}
-		}
+static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
+			      struct nfs_page *prev,
+			      struct nfs_page *req)
+{
+	if (pgio->pg_count == prev->wb_bytes) {
+		/* This is the first coalesce call for a series of nfs_pages */
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   prev->wb_context,
+						   IOMODE_RW);
 	}
 	}
-fail:
-	rcu_read_unlock();
-	return NULL;
+	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
+}
+
+void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+{
+	struct pnfs_layoutdriver_type *ld;
+
+	ld = NFS_SERVER(inode)->pnfs_curr_ld;
+	pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
+}
+
+enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *wdata,
+			const struct rpc_call_ops *call_ops, int how)
+{
+	struct inode *inode = wdata->inode;
+	enum pnfs_try_status trypnfs;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+
+	wdata->mds_ops = call_ops;
+
+	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
+		inode->i_ino, wdata->args.count, wdata->args.offset, how);
+
+	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
+	if (trypnfs == PNFS_NOT_ATTEMPTED) {
+		put_lseg(wdata->lseg);
+		wdata->lseg = NULL;
+	} else
+		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
+
+	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+	return trypnfs;
 }
 }
-EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
 
 
 /*
 /*
- * Add a deviceid to the cache.
- * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ * Call the appropriate parallel I/O subsystem read function.
  */
  */
-struct pnfs_deviceid_node *
-pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
-{
-	struct pnfs_deviceid_node *d;
-	long hash = nfs4_deviceid_hash(&new->de_id);
-
-	dprintk("--> %s hash %ld\n", __func__, hash);
-	spin_lock(&c->dc_lock);
-	d = pnfs_find_get_deviceid(c, &new->de_id);
-	if (d) {
-		spin_unlock(&c->dc_lock);
-		dprintk("%s [discard]\n", __func__);
-		c->dc_free_callback(new);
-		return d;
-	}
-	INIT_HLIST_NODE(&new->de_node);
-	atomic_set(&new->de_ref, 1);
-	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
-	spin_unlock(&c->dc_lock);
-	dprintk("%s [new]\n", __func__);
-	return new;
-}
-EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
-
-void
-pnfs_put_deviceid_cache(struct nfs_client *clp)
+enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *rdata,
+		       const struct rpc_call_ops *call_ops)
 {
 {
-	struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
+	struct inode *inode = rdata->inode;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	enum pnfs_try_status trypnfs;
 
 
-	dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
-	if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
-		int i;
-		/* Verify cache is empty */
-		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
-			BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
-		clp->cl_devid_cache = NULL;
-		spin_unlock(&clp->cl_lock);
-		kfree(local);
+	rdata->mds_ops = call_ops;
+
+	dprintk("%s: Reading ino:%lu %u@%llu\n",
+		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+
+	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
+	if (trypnfs == PNFS_NOT_ATTEMPTED) {
+		put_lseg(rdata->lseg);
+		rdata->lseg = NULL;
+	} else {
+		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
 	}
 	}
+	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+	return trypnfs;
 }
 }
-EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);

+ 70 - 48
fs/nfs/pnfs.h

@@ -30,6 +30,8 @@
 #ifndef FS_NFS_PNFS_H
 #ifndef FS_NFS_PNFS_H
 #define FS_NFS_PNFS_H
 #define FS_NFS_PNFS_H
 
 
+#include <linux/nfs_page.h>
+
 enum {
 enum {
 	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */
 	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */
 	NFS_LSEG_ROC,		/* roc bit received from server */
 	NFS_LSEG_ROC,		/* roc bit received from server */
@@ -43,6 +45,11 @@ struct pnfs_layout_segment {
 	struct pnfs_layout_hdr *pls_layout;
 	struct pnfs_layout_hdr *pls_layout;
 };
 };
 
 
+enum pnfs_try_status {
+	PNFS_ATTEMPTED     = 0,
+	PNFS_NOT_ATTEMPTED = 1,
+};
+
 #ifdef CONFIG_NFS_V4_1
 #ifdef CONFIG_NFS_V4_1
 
 
 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -61,10 +68,18 @@ struct pnfs_layoutdriver_type {
 	const u32 id;
 	const u32 id;
 	const char *name;
 	const char *name;
 	struct module *owner;
 	struct module *owner;
-	int (*set_layoutdriver) (struct nfs_server *);
-	int (*clear_layoutdriver) (struct nfs_server *);
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
+
+	/* test for nfs page cache coalescing */
+	int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+
+	/*
+	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
+	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
+	 */
+	enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
+	enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
 };
 };
 
 
 struct pnfs_layout_hdr {
 struct pnfs_layout_hdr {
@@ -90,52 +105,6 @@ struct pnfs_device {
 	unsigned int  pglen;
 	unsigned int  pglen;
 };
 };
 
 
-/*
- * Device ID RCU cache. A device ID is unique per client ID and layout type.
- */
-#define NFS4_DEVICE_ID_HASH_BITS	5
-#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
-#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
-
-static inline u32
-nfs4_deviceid_hash(struct nfs4_deviceid *id)
-{
-	unsigned char *cptr = (unsigned char *)id->data;
-	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
-	u32 x = 0;
-
-	while (nbytes--) {
-		x *= 37;
-		x += *cptr++;
-	}
-	return x & NFS4_DEVICE_ID_HASH_MASK;
-}
-
-struct pnfs_deviceid_node {
-	struct hlist_node	de_node;
-	struct nfs4_deviceid	de_id;
-	atomic_t		de_ref;
-};
-
-struct pnfs_deviceid_cache {
-	spinlock_t		dc_lock;
-	atomic_t		dc_ref;
-	void			(*dc_free_callback)(struct pnfs_deviceid_node *);
-	struct hlist_head	dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
-};
-
-extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
-			void (*free_callback)(struct pnfs_deviceid_node *));
-extern void pnfs_put_deviceid_cache(struct nfs_client *);
-extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
-				struct pnfs_deviceid_cache *,
-				struct nfs4_deviceid *);
-extern struct pnfs_deviceid_node *pnfs_add_deviceid(
-				struct pnfs_deviceid_cache *,
-				struct pnfs_deviceid_node *);
-extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
-			      struct pnfs_deviceid_node *devid);
-
 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 
 
@@ -146,11 +115,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
 
 
 /* pnfs.c */
 /* pnfs.c */
 void get_layout_hdr(struct pnfs_layout_hdr *lo);
 void get_layout_hdr(struct pnfs_layout_hdr *lo);
+void put_lseg(struct pnfs_layout_segment *lseg);
 struct pnfs_layout_segment *
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 		   enum pnfs_iomode access_type);
 		   enum pnfs_iomode access_type);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void unset_pnfs_layoutdriver(struct nfs_server *);
+enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
+					     const struct rpc_call_ops *, int);
+enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
+					    const struct rpc_call_ops *);
+void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
+void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
 void pnfs_destroy_layout(struct nfs_inode *);
@@ -177,6 +153,16 @@ static inline int lo_fail_bit(u32 iomode)
 			 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
 			 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
 }
 }
 
 
+static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+	if (lseg) {
+		atomic_inc(&lseg->pls_refcount);
+		smp_mb__after_atomic_inc();
+	}
+	return lseg;
+}
+
 /* Return true if a layout driver is being used for this mountpoint */
 /* Return true if a layout driver is being used for this mountpoint */
 static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 {
 {
@@ -193,6 +179,16 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
 {
 {
 }
 }
 
 
+static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+	return NULL;
+}
+
+static inline void put_lseg(struct pnfs_layout_segment *lseg)
+{
+}
+
 static inline struct pnfs_layout_segment *
 static inline struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 		   enum pnfs_iomode access_type)
 		   enum pnfs_iomode access_type)
@@ -200,6 +196,20 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 	return NULL;
 	return NULL;
 }
 }
 
 
+static inline enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *data,
+		      const struct rpc_call_ops *call_ops)
+{
+	return PNFS_NOT_ATTEMPTED;
+}
+
+static inline enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *data,
+		       const struct rpc_call_ops *call_ops, int how)
+{
+	return PNFS_NOT_ATTEMPTED;
+}
+
 static inline bool
 static inline bool
 pnfs_roc(struct inode *ino)
 pnfs_roc(struct inode *ino)
 {
 {
@@ -230,6 +240,18 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
 {
 {
 }
 }
 
 
+static inline void
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+{
+	pgio->pg_test = NULL;
+}
+
+static inline void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+{
+	pgio->pg_test = NULL;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 #endif /* CONFIG_NFS_V4_1 */
 
 
 #endif /* FS_NFS_PNFS_H */
 #endif /* FS_NFS_PNFS_H */

+ 1 - 0
fs/nfs/proc.c

@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.lock		= nfs_proc_lock,
 	.lock		= nfs_proc_lock,
 	.lock_check_bounds = nfs_lock_check_bounds,
 	.lock_check_bounds = nfs_lock_check_bounds,
 	.close_context	= nfs_close_context,
 	.close_context	= nfs_close_context,
+	.init_client	= nfs_init_client,
 };
 };

+ 81 - 46
fs/nfs/read.c

@@ -18,19 +18,20 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/nfs_page.h>
+#include <linux/module.h>
 
 
 #include <asm/system.h>
 #include <asm/system.h>
+#include "pnfs.h"
 
 
 #include "nfs4_fs.h"
 #include "nfs4_fs.h"
 #include "internal.h"
 #include "internal.h"
 #include "iostat.h"
 #include "iostat.h"
 #include "fscache.h"
 #include "fscache.h"
-#include "pnfs.h"
 
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
 
-static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
-static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
+static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc);
+static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
 static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 
 
@@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
 
 
 static void nfs_readdata_release(struct nfs_read_data *rdata)
 static void nfs_readdata_release(struct nfs_read_data *rdata)
 {
 {
+	put_lseg(rdata->lseg);
 	put_nfs_open_context(rdata->args.context);
 	put_nfs_open_context(rdata->args.context);
 	nfs_readdata_free(rdata);
 	nfs_readdata_free(rdata);
 }
 }
@@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
 int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 		       struct page *page)
 		       struct page *page)
 {
 {
-	LIST_HEAD(one_request);
 	struct nfs_page	*new;
 	struct nfs_page	*new;
 	unsigned int len;
 	unsigned int len;
+	struct nfs_pageio_descriptor pgio;
 
 
 	len = nfs_page_length(page);
 	len = nfs_page_length(page);
 	if (len == 0)
 	if (len == 0)
 		return nfs_return_empty_page(page);
 		return nfs_return_empty_page(page);
-	pnfs_update_layout(inode, ctx, IOMODE_READ);
 	new = nfs_create_request(ctx, inode, page, 0, len);
 	new = nfs_create_request(ctx, inode, page, 0, len);
 	if (IS_ERR(new)) {
 	if (IS_ERR(new)) {
 		unlock_page(page);
 		unlock_page(page);
@@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	if (len < PAGE_CACHE_SIZE)
 	if (len < PAGE_CACHE_SIZE)
 		zero_user_segment(page, len, PAGE_CACHE_SIZE);
 		zero_user_segment(page, len, PAGE_CACHE_SIZE);
 
 
-	nfs_list_add_request(new, &one_request);
+	nfs_pageio_init(&pgio, inode, NULL, 0, 0);
+	nfs_list_add_request(new, &pgio.pg_list);
+	pgio.pg_count = len;
+
 	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
 	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
-		nfs_pagein_multi(inode, &one_request, 1, len, 0);
+		nfs_pagein_multi(&pgio);
 	else
 	else
-		nfs_pagein_one(inode, &one_request, 1, len, 0);
+		nfs_pagein_one(&pgio);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req)
 	nfs_release_request(req);
 	nfs_release_request(req);
 }
 }
 
 
-/*
- * Set up the NFS read request struct
- */
-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
-		const struct rpc_call_ops *call_ops,
-		unsigned int count, unsigned int offset)
+int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+		      const struct rpc_call_ops *call_ops)
 {
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	struct inode *inode = data->inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 	struct rpc_task *task;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
 		.rpc_resp = &data->res,
-		.rpc_cred = req->wb_context->cred,
+		.rpc_cred = data->cred,
 	};
 	};
 	struct rpc_task_setup task_setup_data = {
 	struct rpc_task_setup task_setup_data = {
 		.task = &data->task,
 		.task = &data->task,
-		.rpc_client = NFS_CLIENT(inode),
+		.rpc_client = clnt,
 		.rpc_message = &msg,
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.callback_data = data,
@@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 		.flags = RPC_TASK_ASYNC | swap_flags,
 		.flags = RPC_TASK_ASYNC | swap_flags,
 	};
 	};
 
 
+	/* Set up the initial task struct. */
+	NFS_PROTO(inode)->read_setup(data, &msg);
+
+	dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
+			"offset %llu)\n",
+			data->task.tk_pid,
+			inode->i_sb->s_id,
+			(long long)NFS_FILEID(inode),
+			data->args.count,
+			(unsigned long long)data->args.offset);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_read);
+
+/*
+ * Set up the NFS read request struct
+ */
+static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+		const struct rpc_call_ops *call_ops,
+		unsigned int count, unsigned int offset,
+		struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = req->wb_context->path.dentry->d_inode;
+
 	data->req	  = req;
 	data->req	  = req;
 	data->inode	  = inode;
 	data->inode	  = inode;
-	data->cred	  = msg.rpc_cred;
+	data->cred	  = req->wb_context->cred;
+	data->lseg	  = get_lseg(lseg);
 
 
 	data->args.fh     = NFS_FH(inode);
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
 	data->args.offset = req_offset(req) + offset;
@@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	data->res.eof     = 0;
 	data->res.eof     = 0;
 	nfs_fattr_init(&data->fattr);
 	nfs_fattr_init(&data->fattr);
 
 
-	/* Set up the initial task struct. */
-	NFS_PROTO(inode)->read_setup(data, &msg);
-
-	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-			data->task.tk_pid,
-			inode->i_sb->s_id,
-			(long long)NFS_FILEID(inode),
-			count,
-			(unsigned long long)data->args.offset);
+	if (data->lseg &&
+	    (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
+		return 0;
 
 
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
-	rpc_put_task(task);
-	return 0;
+	return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
 }
 }
 
 
 static void
 static void
@@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head)
  * won't see the new data until our attribute cache is updated.  This is more
  * won't see the new data until our attribute cache is updated.  This is more
  * or less conventional NFS client behavior.
  * or less conventional NFS client behavior.
  */
  */
-static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
+static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
 {
 {
-	struct nfs_page *req = nfs_list_entry(head->next);
+	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
 	struct page *page = req->wb_page;
 	struct page *page = req->wb_page;
 	struct nfs_read_data *data;
 	struct nfs_read_data *data;
-	size_t rsize = NFS_SERVER(inode)->rsize, nbytes;
+	size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes;
 	unsigned int offset;
 	unsigned int offset;
 	int requests = 0;
 	int requests = 0;
 	int ret = 0;
 	int ret = 0;
+	struct pnfs_layout_segment *lseg;
 	LIST_HEAD(list);
 	LIST_HEAD(list);
 
 
 	nfs_list_remove_request(req);
 	nfs_list_remove_request(req);
 
 
-	nbytes = count;
+	nbytes = desc->pg_count;
 	do {
 	do {
 		size_t len = min(nbytes,rsize);
 		size_t len = min(nbytes,rsize);
 
 
@@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 	} while(nbytes != 0);
 	} while(nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
 	atomic_set(&req->wb_complete, requests);
 
 
+	BUG_ON(desc->pg_lseg != NULL);
+	lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
 	ClearPageError(page);
 	ClearPageError(page);
 	offset = 0;
 	offset = 0;
-	nbytes = count;
+	nbytes = desc->pg_count;
 	do {
 	do {
 		int ret2;
 		int ret2;
 
 
@@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 		if (nbytes < rsize)
 		if (nbytes < rsize)
 			rsize = nbytes;
 			rsize = nbytes;
 		ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
 		ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
-				  rsize, offset);
+					 rsize, offset, lseg);
 		if (ret == 0)
 		if (ret == 0)
 			ret = ret2;
 			ret = ret2;
 		offset += rsize;
 		offset += rsize;
 		nbytes -= rsize;
 		nbytes -= rsize;
 	} while (nbytes != 0);
 	} while (nbytes != 0);
+	put_lseg(lseg);
+	desc->pg_lseg = NULL;
 
 
 	return ret;
 	return ret;
 
 
@@ -300,16 +325,21 @@ out_bad:
 	return -ENOMEM;
 	return -ENOMEM;
 }
 }
 
 
-static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
+static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
 {
 {
 	struct nfs_page		*req;
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct page		**pages;
 	struct nfs_read_data	*data;
 	struct nfs_read_data	*data;
+	struct list_head *head = &desc->pg_list;
+	struct pnfs_layout_segment *lseg = desc->pg_lseg;
 	int ret = -ENOMEM;
 	int ret = -ENOMEM;
 
 
-	data = nfs_readdata_alloc(npages);
-	if (!data)
-		goto out_bad;
+	data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
+						     desc->pg_count));
+	if (!data) {
+		nfs_async_read_error(head);
+		goto out;
+	}
 
 
 	pages = data->pagevec;
 	pages = data->pagevec;
 	while (!list_empty(head)) {
 	while (!list_empty(head)) {
@@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
 		*pages++ = req->wb_page;
 		*pages++ = req->wb_page;
 	}
 	}
 	req = nfs_list_entry(data->pages.next);
 	req = nfs_list_entry(data->pages.next);
+	if ((!lseg) && list_is_singular(&data->pages))
+		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
 
 
-	return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
-out_bad:
-	nfs_async_read_error(head);
+	ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
+				0, lseg);
+out:
+	put_lseg(lseg);
+	desc->pg_lseg = NULL;
 	return ret;
 	return ret;
 }
 }
 
 
@@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 		return;
 		return;
 
 
 	/* Yes, so retry the read at the end of the data */
 	/* Yes, so retry the read at the end of the data */
+	data->mds_offset += resp->count;
 	argp->offset += resp->count;
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
 	argp->count -= resp->count;
@@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	if (ret == 0)
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 		goto read_complete; /* all pages were read */
 
 
-	pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
+	pnfs_pageio_init_read(&pgio, inode);
 	if (rsize < PAGE_CACHE_SIZE)
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
 	else

+ 77 - 207
fs/nfs/super.c

@@ -1008,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value,
 	return 1;
 	return 1;
 }
 }
 
 
+/*
+ * Replace the string held in *option with a freshly allocated copy of
+ * the matched mount-option argument.
+ *
+ * Any previous value of *option is freed first.  Returns 0 on success
+ * and non-zero if match_strdup() could not allocate the copy, in which
+ * case *option is left NULL.
+ */
+static int nfs_get_option_str(substring_t args[], char **option)
+{
+	kfree(*option);
+	*option = match_strdup(args);
+	/* Test the new string, not the address of the caller's pointer. */
+	return !*option;
+}
+
+static int nfs_get_option_ul(substring_t args[], unsigned long *option)
+{
+	int rc;
+	char *string;
+
+	string = match_strdup(args);
+	if (string == NULL)
+		return -ENOMEM;
+	rc = strict_strtoul(string, 10, option);
+	kfree(string);
+
+	return rc;
+}
+
 /*
 /*
  * Error-check and convert a string of mount options from user space into
  * Error-check and convert a string of mount options from user space into
  * a data structure.  The whole mount string is processed; bad options are
  * a data structure.  The whole mount string is processed; bad options are
@@ -1156,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw,
 		 * options that take numeric values
 		 * options that take numeric values
 		 */
 		 */
 		case Opt_port:
 		case Opt_port:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 || option > USHRT_MAX)
+			if (nfs_get_option_ul(args, &option) ||
+			    option > USHRT_MAX)
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->nfs_server.port = option;
 			mnt->nfs_server.port = option;
 			break;
 			break;
 		case Opt_rsize:
 		case Opt_rsize:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->rsize = option;
 			mnt->rsize = option;
 			break;
 			break;
 		case Opt_wsize:
 		case Opt_wsize:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->wsize = option;
 			mnt->wsize = option;
 			break;
 			break;
 		case Opt_bsize:
 		case Opt_bsize:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->bsize = option;
 			mnt->bsize = option;
 			break;
 			break;
 		case Opt_timeo:
 		case Opt_timeo:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 || option == 0)
+			if (nfs_get_option_ul(args, &option) || option == 0)
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->timeo = option;
 			mnt->timeo = option;
 			break;
 			break;
 		case Opt_retrans:
 		case Opt_retrans:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 || option == 0)
+			if (nfs_get_option_ul(args, &option) || option == 0)
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->retrans = option;
 			mnt->retrans = option;
 			break;
 			break;
 		case Opt_acregmin:
 		case Opt_acregmin:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->acregmin = option;
 			mnt->acregmin = option;
 			break;
 			break;
 		case Opt_acregmax:
 		case Opt_acregmax:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->acregmax = option;
 			mnt->acregmax = option;
 			break;
 			break;
 		case Opt_acdirmin:
 		case Opt_acdirmin:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->acdirmin = option;
 			mnt->acdirmin = option;
 			break;
 			break;
 		case Opt_acdirmax:
 		case Opt_acdirmax:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->acdirmax = option;
 			mnt->acdirmax = option;
 			break;
 			break;
 		case Opt_actimeo:
 		case Opt_actimeo:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->acregmin = mnt->acregmax =
 			mnt->acregmin = mnt->acregmax =
 			mnt->acdirmin = mnt->acdirmax = option;
 			mnt->acdirmin = mnt->acdirmax = option;
 			break;
 			break;
 		case Opt_namelen:
 		case Opt_namelen:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->namlen = option;
 			mnt->namlen = option;
 			break;
 			break;
 		case Opt_mountport:
 		case Opt_mountport:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 || option > USHRT_MAX)
+			if (nfs_get_option_ul(args, &option) ||
+			    option > USHRT_MAX)
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->mount_server.port = option;
 			mnt->mount_server.port = option;
 			break;
 			break;
 		case Opt_mountvers:
 		case Opt_mountvers:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 ||
+			if (nfs_get_option_ul(args, &option) ||
 			    option < NFS_MNT_VERSION ||
 			    option < NFS_MNT_VERSION ||
 			    option > NFS_MNT3_VERSION)
 			    option > NFS_MNT3_VERSION)
 				goto out_invalid_value;
 				goto out_invalid_value;
 			mnt->mount_server.version = option;
 			mnt->mount_server.version = option;
 			break;
 			break;
 		case Opt_nfsvers:
 		case Opt_nfsvers:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			switch (option) {
 			switch (option) {
 			case NFS2_VERSION:
 			case NFS2_VERSION:
@@ -1324,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw,
 			}
 			}
 			break;
 			break;
 		case Opt_minorversion:
 		case Opt_minorversion:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 				goto out_invalid_value;
 			if (option > NFS4_MAX_MINOR_VERSION)
 			if (option > NFS4_MAX_MINOR_VERSION)
 				goto out_invalid_value;
 				goto out_invalid_value;
@@ -1365,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw,
 			case Opt_xprt_udp:
 			case Opt_xprt_udp:
 				mnt->flags &= ~NFS_MOUNT_TCP;
 				mnt->flags &= ~NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
-				kfree(string);
 				break;
 				break;
 			case Opt_xprt_tcp6:
 			case Opt_xprt_tcp6:
 				protofamily = AF_INET6;
 				protofamily = AF_INET6;
 			case Opt_xprt_tcp:
 			case Opt_xprt_tcp:
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-				kfree(string);
 				break;
 				break;
 			case Opt_xprt_rdma:
 			case Opt_xprt_rdma:
 				/* vector side protocols to TCP */
 				/* vector side protocols to TCP */
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
 				xprt_load_transport(string);
 				xprt_load_transport(string);
-				kfree(string);
 				break;
 				break;
 			default:
 			default:
 				dfprintk(MOUNT, "NFS:   unrecognized "
 				dfprintk(MOUNT, "NFS:   unrecognized "
@@ -1387,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw,
 				kfree(string);
 				kfree(string);
 				return 0;
 				return 0;
 			}
 			}
+			kfree(string);
 			break;
 			break;
 		case Opt_mountproto:
 		case Opt_mountproto:
 			string = match_strdup(args);
 			string = match_strdup(args);
@@ -1429,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_invalid_address;
 				goto out_invalid_address;
 			break;
 			break;
 		case Opt_clientaddr:
 		case Opt_clientaddr:
-			string = match_strdup(args);
-			if (string == NULL)
+			if (nfs_get_option_str(args, &mnt->client_address))
 				goto out_nomem;
 				goto out_nomem;
-			kfree(mnt->client_address);
-			mnt->client_address = string;
 			break;
 			break;
 		case Opt_mounthost:
 		case Opt_mounthost:
-			string = match_strdup(args);
-			if (string == NULL)
+			if (nfs_get_option_str(args,
+					       &mnt->mount_server.hostname))
 				goto out_nomem;
 				goto out_nomem;
-			kfree(mnt->mount_server.hostname);
-			mnt->mount_server.hostname = string;
 			break;
 			break;
 		case Opt_mountaddr:
 		case Opt_mountaddr:
 			string = match_strdup(args);
 			string = match_strdup(args);
@@ -1480,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw,
 			};
 			};
 			break;
 			break;
 		case Opt_fscache_uniq:
 		case Opt_fscache_uniq:
-			string = match_strdup(args);
-			if (string == NULL)
+			if (nfs_get_option_str(args, &mnt->fscache_uniq))
 				goto out_nomem;
 				goto out_nomem;
-			kfree(mnt->fscache_uniq);
-			mnt->fscache_uniq = string;
 			mnt->options |= NFS_OPTION_FSCACHE;
 			mnt->options |= NFS_OPTION_FSCACHE;
 			break;
 			break;
 		case Opt_local_lock:
 		case Opt_local_lock:
@@ -1694,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	return nfs_walk_authlist(args, &request);
 	return nfs_walk_authlist(args, &request);
 }
 }
 
 
-static int nfs_parse_simple_hostname(const char *dev_name,
-				     char **hostname, size_t maxnamlen,
-				     char **export_path, size_t maxpathlen)
+/*
+ * Split "dev_name" into "hostname:export_path".
+ *
+ * The leftmost colon demarks the split between the server's hostname
+ * and the export path.  If the hostname starts with a left square
+ * bracket, then it may contain colons.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ */
+static int nfs_parse_devname(const char *dev_name,
+			     char **hostname, size_t maxnamlen,
+			     char **export_path, size_t maxpathlen)
 {
 {
 	size_t len;
 	size_t len;
-	char *colon, *comma;
+	char *end;
 
 
-	colon = strchr(dev_name, ':');
-	if (colon == NULL)
-		goto out_bad_devname;
-
-	len = colon - dev_name;
-	if (len > maxnamlen)
-		goto out_hostname;
-
-	/* N.B. caller will free nfs_server.hostname in all cases */
-	*hostname = kstrndup(dev_name, len, GFP_KERNEL);
-	if (!*hostname)
-		goto out_nomem;
-
-	/* kill possible hostname list: not supported */
-	comma = strchr(*hostname, ',');
-	if (comma != NULL) {
-		if (comma == *hostname)
+	/* Is the host name protected with square brackets? */
+	if (*dev_name == '[') {
+		end = strchr(++dev_name, ']');
+		if (end == NULL || end[1] != ':')
 			goto out_bad_devname;
 			goto out_bad_devname;
-		*comma = '\0';
-	}
-
-	colon++;
-	len = strlen(colon);
-	if (len > maxpathlen)
-		goto out_path;
-	*export_path = kstrndup(colon, len, GFP_KERNEL);
-	if (!*export_path)
-		goto out_nomem;
-
-	dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
-	return 0;
-
-out_bad_devname:
-	dfprintk(MOUNT, "NFS: device name not in host:path format\n");
-	return -EINVAL;
 
 
-out_nomem:
-	dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
-	return -ENOMEM;
-
-out_hostname:
-	dfprintk(MOUNT, "NFS: server hostname too long\n");
-	return -ENAMETOOLONG;
-
-out_path:
-	dfprintk(MOUNT, "NFS: export pathname too long\n");
-	return -ENAMETOOLONG;
-}
-
-/*
- * Hostname has square brackets around it because it contains one or
- * more colons.  We look for the first closing square bracket, and a
- * colon must follow it.
- */
-static int nfs_parse_protected_hostname(const char *dev_name,
-					char **hostname, size_t maxnamlen,
-					char **export_path, size_t maxpathlen)
-{
-	size_t len;
-	char *start, *end;
+		len = end - dev_name;
+		end++;
+	} else {
+		char *comma;
 
 
-	start = (char *)(dev_name + 1);
+		end = strchr(dev_name, ':');
+		if (end == NULL)
+			goto out_bad_devname;
+		len = end - dev_name;
 
 
-	end = strchr(start, ']');
-	if (end == NULL)
-		goto out_bad_devname;
-	if (*(end + 1) != ':')
-		goto out_bad_devname;
+		/* kill possible hostname list: not supported */
+		comma = strchr(dev_name, ',');
+		if (comma != NULL && comma < end)
+			*comma = 0;
+	}
 
 
-	len = end - start;
 	if (len > maxnamlen)
 	if (len > maxnamlen)
 		goto out_hostname;
 		goto out_hostname;
 
 
 	/* N.B. caller will free nfs_server.hostname in all cases */
 	/* N.B. caller will free nfs_server.hostname in all cases */
-	*hostname = kstrndup(start, len, GFP_KERNEL);
+	*hostname = kstrndup(dev_name, len, GFP_KERNEL);
 	if (*hostname == NULL)
 	if (*hostname == NULL)
 		goto out_nomem;
 		goto out_nomem;
-
-	end += 2;
-	len = strlen(end);
+	len = strlen(++end);
 	if (len > maxpathlen)
 	if (len > maxpathlen)
 		goto out_path;
 		goto out_path;
 	*export_path = kstrndup(end, len, GFP_KERNEL);
 	*export_path = kstrndup(end, len, GFP_KERNEL);
 	if (!*export_path)
 	if (!*export_path)
 		goto out_nomem;
 		goto out_nomem;
 
 
+	dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
 	return 0;
 	return 0;
 
 
 out_bad_devname:
 out_bad_devname:
@@ -1806,29 +1699,6 @@ out_path:
 	return -ENAMETOOLONG;
 	return -ENAMETOOLONG;
 }
 }
 
 
-/*
- * Split "dev_name" into "hostname:export_path".
- *
- * The leftmost colon demarks the split between the server's hostname
- * and the export path.  If the hostname starts with a left square
- * bracket, then it may contain colons.
- *
- * Note: caller frees hostname and export path, even on error.
- */
-static int nfs_parse_devname(const char *dev_name,
-			     char **hostname, size_t maxnamlen,
-			     char **export_path, size_t maxpathlen)
-{
-	if (*dev_name == '[')
-		return nfs_parse_protected_hostname(dev_name,
-						    hostname, maxnamlen,
-						    export_path, maxpathlen);
-
-	return nfs_parse_simple_hostname(dev_name,
-					 hostname, maxnamlen,
-					 export_path, maxpathlen);
-}
-
 /*
 /*
  * Validate the NFS2/NFS3 mount data
  * Validate the NFS2/NFS3 mount data
  * - fills in the mount root filehandle
  * - fills in the mount root filehandle

+ 95 - 58
fs/nfs/write.c

@@ -28,6 +28,7 @@
 #include "iostat.h"
 #include "iostat.h"
 #include "nfs4_fs.h"
 #include "nfs4_fs.h"
 #include "fscache.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
 
@@ -96,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
 
 
 static void nfs_writedata_release(struct nfs_write_data *wdata)
 static void nfs_writedata_release(struct nfs_write_data *wdata)
 {
 {
+	put_lseg(wdata->lseg);
 	put_nfs_open_context(wdata->args.context);
 	put_nfs_open_context(wdata->args.context);
 	nfs_writedata_free(wdata);
 	nfs_writedata_free(wdata);
 }
 }
@@ -781,25 +783,21 @@ static int flush_task_priority(int how)
 	return RPC_PRIORITY_NORMAL;
 	return RPC_PRIORITY_NORMAL;
 }
 }
 
 
-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static int nfs_write_rpcsetup(struct nfs_page *req,
-		struct nfs_write_data *data,
-		const struct rpc_call_ops *call_ops,
-		unsigned int count, unsigned int offset,
-		int how)
+int nfs_initiate_write(struct nfs_write_data *data,
+		       struct rpc_clnt *clnt,
+		       const struct rpc_call_ops *call_ops,
+		       int how)
 {
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	struct inode *inode = data->inode;
 	int priority = flush_task_priority(how);
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
 		.rpc_resp = &data->res,
-		.rpc_cred = req->wb_context->cred,
+		.rpc_cred = data->cred,
 	};
 	};
 	struct rpc_task_setup task_setup_data = {
 	struct rpc_task_setup task_setup_data = {
-		.rpc_client = NFS_CLIENT(inode),
+		.rpc_client = clnt,
 		.task = &data->task,
 		.task = &data->task,
 		.rpc_message = &msg,
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_ops = call_ops,
@@ -810,12 +808,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	};
 	};
 	int ret = 0;
 	int ret = 0;
 
 
+	/* Set up the initial task struct.  */
+	NFS_PROTO(inode)->write_setup(data, &msg);
+
+	dprintk("NFS: %5u initiated write call "
+		"(req %s/%lld, %u bytes @ offset %llu)\n",
+		data->task.tk_pid,
+		inode->i_sb->s_id,
+		(long long)NFS_FILEID(inode),
+		data->args.count,
+		(unsigned long long)data->args.offset);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	if (how & FLUSH_SYNC) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+	}
+	rpc_put_task(task);
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_write);
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+static int nfs_write_rpcsetup(struct nfs_page *req,
+		struct nfs_write_data *data,
+		const struct rpc_call_ops *call_ops,
+		unsigned int count, unsigned int offset,
+		struct pnfs_layout_segment *lseg,
+		int how)
+{
+	struct inode *inode = req->wb_context->path.dentry->d_inode;
+
 	/* Set up the RPC argument and reply structs
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
 	 * NB: take care not to mess about with data->commit et al. */
 
 
 	data->req = req;
 	data->req = req;
 	data->inode = inode = req->wb_context->path.dentry->d_inode;
 	data->inode = inode = req->wb_context->path.dentry->d_inode;
-	data->cred = msg.rpc_cred;
+	data->cred = req->wb_context->cred;
+	data->lseg = get_lseg(lseg);
 
 
 	data->args.fh     = NFS_FH(inode);
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
 	data->args.offset = req_offset(req) + offset;
@@ -836,30 +874,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	data->res.verf    = &data->verf;
 	data->res.verf    = &data->verf;
 	nfs_fattr_init(&data->fattr);
 	nfs_fattr_init(&data->fattr);
 
 
-	/* Set up the initial task struct.  */
-	NFS_PROTO(inode)->write_setup(data, &msg);
-
-	dprintk("NFS: %5u initiated write call "
-		"(req %s/%lld, %u bytes @ offset %llu)\n",
-		data->task.tk_pid,
-		inode->i_sb->s_id,
-		(long long)NFS_FILEID(inode),
-		count,
-		(unsigned long long)data->args.offset);
+	if (data->lseg &&
+	    (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
+		return 0;
 
 
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task)) {
-		ret = PTR_ERR(task);
-		goto out;
-	}
-	if (how & FLUSH_SYNC) {
-		ret = rpc_wait_for_completion_task(task);
-		if (ret == 0)
-			ret = task->tk_status;
-	}
-	rpc_put_task(task);
-out:
-	return ret;
+	return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
 }
 }
 
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -879,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req)
  * Generate multiple small requests to write out a single
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
  * contiguous dirty area on one page.
  */
  */
-static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
+static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
 {
 {
-	struct nfs_page *req = nfs_list_entry(head->next);
+	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
 	struct page *page = req->wb_page;
 	struct page *page = req->wb_page;
 	struct nfs_write_data *data;
 	struct nfs_write_data *data;
-	size_t wsize = NFS_SERVER(inode)->wsize, nbytes;
+	size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
 	unsigned int offset;
 	unsigned int offset;
 	int requests = 0;
 	int requests = 0;
 	int ret = 0;
 	int ret = 0;
+	struct pnfs_layout_segment *lseg;
 	LIST_HEAD(list);
 	LIST_HEAD(list);
 
 
 	nfs_list_remove_request(req);
 	nfs_list_remove_request(req);
 
 
-	nbytes = count;
+	nbytes = desc->pg_count;
 	do {
 	do {
 		size_t len = min(nbytes, wsize);
 		size_t len = min(nbytes, wsize);
 
 
@@ -905,9 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 	} while (nbytes != 0);
 	} while (nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
 	atomic_set(&req->wb_complete, requests);
 
 
+	BUG_ON(desc->pg_lseg);
+	lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
 	ClearPageError(page);
 	ClearPageError(page);
 	offset = 0;
 	offset = 0;
-	nbytes = count;
+	nbytes = desc->pg_count;
 	do {
 	do {
 		int ret2;
 		int ret2;
 
 
@@ -919,13 +941,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 		if (nbytes < wsize)
 		if (nbytes < wsize)
 			wsize = nbytes;
 			wsize = nbytes;
 		ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
 		ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
-				   wsize, offset, how);
+					  wsize, offset, lseg, desc->pg_ioflags);
 		if (ret == 0)
 		if (ret == 0)
 			ret = ret2;
 			ret = ret2;
 		offset += wsize;
 		offset += wsize;
 		nbytes -= wsize;
 		nbytes -= wsize;
 	} while (nbytes != 0);
 	} while (nbytes != 0);
 
 
+	put_lseg(lseg);
+	desc->pg_lseg = NULL;
 	return ret;
 	return ret;
 
 
 out_bad:
 out_bad:
@@ -946,16 +970,26 @@ out_bad:
  * This is the case if nfs_updatepage detects a conflicting request
  * This is the case if nfs_updatepage detects a conflicting request
  * that has been written but not committed.
  * that has been written but not committed.
  */
  */
-static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
+static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
 {
 {
 	struct nfs_page		*req;
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct page		**pages;
 	struct nfs_write_data	*data;
 	struct nfs_write_data	*data;
+	struct list_head *head = &desc->pg_list;
+	struct pnfs_layout_segment *lseg = desc->pg_lseg;
+	int ret;
 
 
-	data = nfs_writedata_alloc(npages);
-	if (!data)
-		goto out_bad;
-
+	data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
+						      desc->pg_count));
+	if (!data) {
+		while (!list_empty(head)) {
+			req = nfs_list_entry(head->next);
+			nfs_list_remove_request(req);
+			nfs_redirty_request(req);
+		}
+		ret = -ENOMEM;
+		goto out;
+	}
 	pages = data->pagevec;
 	pages = data->pagevec;
 	while (!list_empty(head)) {
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		req = nfs_list_entry(head->next);
@@ -965,16 +999,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 		*pages++ = req->wb_page;
 		*pages++ = req->wb_page;
 	}
 	}
 	req = nfs_list_entry(data->pages.next);
 	req = nfs_list_entry(data->pages.next);
+	if ((!lseg) && list_is_singular(&data->pages))
+		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
 
 
 	/* Set up the argument struct */
 	/* Set up the argument struct */
-	return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
- out_bad:
-	while (!list_empty(head)) {
-		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_redirty_request(req);
-	}
-	return -ENOMEM;
+	ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
+out:
+	put_lseg(lseg); /* Cleans any gotten in ->pg_test */
+	desc->pg_lseg = NULL;
+	return ret;
 }
 }
 
 
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -982,6 +1015,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 {
 {
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	size_t wsize = NFS_SERVER(inode)->wsize;
 
 
+	pnfs_pageio_init_write(pgio, inode);
+
 	if (wsize < PAGE_CACHE_SIZE)
 	if (wsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
 		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
 	else
 	else
@@ -1132,7 +1167,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
 /*
 /*
  * This function is called when the WRITE call is complete.
  * This function is called when the WRITE call is complete.
  */
  */
-int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 {
 	struct nfs_writeargs	*argp = &data->args;
 	struct nfs_writeargs	*argp = &data->args;
 	struct nfs_writeres	*resp = &data->res;
 	struct nfs_writeres	*resp = &data->res;
@@ -1151,7 +1186,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 	 */
 	 */
 	status = NFS_PROTO(data->inode)->write_done(task, data);
 	status = NFS_PROTO(data->inode)->write_done(task, data);
 	if (status != 0)
 	if (status != 0)
-		return status;
+		return;
 	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
 	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
 
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1166,6 +1201,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		 */
 		 */
 		static unsigned long    complain;
 		static unsigned long    complain;
 
 
+		/* Note this will print the MDS for a DS write */
 		if (time_before(complain, jiffies)) {
 		if (time_before(complain, jiffies)) {
 			dprintk("NFS:       faulty NFS server %s:"
 			dprintk("NFS:       faulty NFS server %s:"
 				" (committed = %d) != (stable = %d)\n",
 				" (committed = %d) != (stable = %d)\n",
@@ -1186,6 +1222,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 			/* Was this an NFSv2 write or an NFSv3 stable write? */
 			/* Was this an NFSv2 write or an NFSv3 stable write? */
 			if (resp->verf->committed != NFS_UNSTABLE) {
 			if (resp->verf->committed != NFS_UNSTABLE) {
 				/* Resend from where the server left off */
 				/* Resend from where the server left off */
+				data->mds_offset += resp->count;
 				argp->offset += resp->count;
 				argp->offset += resp->count;
 				argp->pgbase += resp->count;
 				argp->pgbase += resp->count;
 				argp->count -= resp->count;
 				argp->count -= resp->count;
@@ -1196,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 				argp->stable = NFS_FILE_SYNC;
 				argp->stable = NFS_FILE_SYNC;
 			}
 			}
 			nfs_restart_rpc(task, server->nfs_client);
 			nfs_restart_rpc(task, server->nfs_client);
-			return -EAGAIN;
+			return;
 		}
 		}
 		if (time_before(complain, jiffies)) {
 		if (time_before(complain, jiffies)) {
 			printk(KERN_WARNING
 			printk(KERN_WARNING
@@ -1207,7 +1244,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		/* Can't do anything about it except throw an error. */
 		/* Can't do anything about it except throw an error. */
 		task->tk_status = -EIO;
 		task->tk_status = -EIO;
 	}
 	}
-	return 0;
+	return;
 }
 }
 
 
 
 

+ 1 - 1
include/linux/nfs_fs.h

@@ -501,7 +501,7 @@ extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
 extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
-extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
+extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
 
 
 /*
 /*
  * Try to write back everything synchronously (but check the
  * Try to write back everything synchronously (but check the

+ 3 - 1
include/linux/nfs_fs_sb.h

@@ -30,6 +30,8 @@ struct nfs_client {
 #define NFS_CS_CALLBACK		1		/* - callback started */
 #define NFS_CS_CALLBACK		1		/* - callback started */
 #define NFS_CS_IDMAP		2		/* - idmap started */
 #define NFS_CS_IDMAP		2		/* - idmap started */
 #define NFS_CS_RENEWD		3		/* - renewd started */
 #define NFS_CS_RENEWD		3		/* - renewd started */
+#define NFS_CS_STOP_RENEW	4		/* no more state to renew */
+#define NFS_CS_CHECK_LEASE_TIME	5		/* need to check lease time */
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	size_t			cl_addrlen;
 	size_t			cl_addrlen;
 	char *			cl_hostname;	/* hostname of server */
 	char *			cl_hostname;	/* hostname of server */
@@ -75,7 +77,6 @@ struct nfs_client {
 	u32			cl_exchange_flags;
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session; 	/* sharred session */
 	struct nfs4_session	*cl_session; 	/* sharred session */
 	struct list_head	cl_layouts;
 	struct list_head	cl_layouts;
-	struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
 #endif /* CONFIG_NFS_V4 */
 #endif /* CONFIG_NFS_V4 */
 
 
 #ifdef CONFIG_NFS_FSCACHE
 #ifdef CONFIG_NFS_FSCACHE
@@ -176,6 +177,7 @@ struct nfs_server {
 #define NFS_CAP_CTIME		(1U << 12)
 #define NFS_CAP_CTIME		(1U << 12)
 #define NFS_CAP_MTIME		(1U << 13)
 #define NFS_CAP_MTIME		(1U << 13)
 #define NFS_CAP_POSIX_LOCK	(1U << 14)
 #define NFS_CAP_POSIX_LOCK	(1U << 14)
+#define NFS_CAP_UIDGID_NOMAP	(1U << 15)
 
 
 
 
 /* maximum number of slots to use */
 /* maximum number of slots to use */

+ 5 - 4
include/linux/nfs_idmap.h

@@ -65,6 +65,7 @@ struct idmap_msg {
 
 
 /* Forward declaration to make this header independent of others */
 /* Forward declaration to make this header independent of others */
 struct nfs_client;
 struct nfs_client;
+struct nfs_server;
 
 
 #ifdef CONFIG_NFS_USE_NEW_IDMAPPER
 #ifdef CONFIG_NFS_USE_NEW_IDMAPPER
 
 
@@ -96,10 +97,10 @@ void nfs_idmap_delete(struct nfs_client *);
 
 
 #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 
 
-int nfs_map_name_to_uid(struct nfs_client *, const char *, size_t, __u32 *);
-int nfs_map_group_to_gid(struct nfs_client *, const char *, size_t, __u32 *);
-int nfs_map_uid_to_name(struct nfs_client *, __u32, char *, size_t);
-int nfs_map_gid_to_group(struct nfs_client *, __u32, char *, size_t);
+int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, __u32 *);
+int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, __u32 *);
+int nfs_map_uid_to_name(const struct nfs_server *, __u32, char *, size_t);
+int nfs_map_gid_to_group(const struct nfs_server *, __u32, char *, size_t);
 
 
 extern unsigned int nfs_idmap_cache_timeout;
 extern unsigned int nfs_idmap_cache_timeout;
 #endif /* __KERNEL__ */
 #endif /* __KERNEL__ */

+ 2 - 0
include/linux/nfs_iostat.h

@@ -113,6 +113,8 @@ enum nfs_stat_eventcounters {
 	NFSIOS_SHORTREAD,
 	NFSIOS_SHORTREAD,
 	NFSIOS_SHORTWRITE,
 	NFSIOS_SHORTWRITE,
 	NFSIOS_DELAY,
 	NFSIOS_DELAY,
+	NFSIOS_PNFS_READ,
+	NFSIOS_PNFS_WRITE,
 	__NFSIOS_COUNTSMAX,
 	__NFSIOS_COUNTSMAX,
 };
 };
 
 

+ 4 - 2
include/linux/nfs_page.h

@@ -59,9 +59,11 @@ struct nfs_pageio_descriptor {
 	unsigned int		pg_base;
 	unsigned int		pg_base;
 
 
 	struct inode		*pg_inode;
 	struct inode		*pg_inode;
-	int			(*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
+	int			(*pg_doio)(struct nfs_pageio_descriptor *);
 	int 			pg_ioflags;
 	int 			pg_ioflags;
 	int			pg_error;
 	int			pg_error;
+	struct pnfs_layout_segment *pg_lseg;
+	int			(*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
 };
 };
 
 
 #define NFS_WBACK_BUSY(req)	(test_bit(PG_BUSY,&(req)->wb_flags))
 #define NFS_WBACK_BUSY(req)	(test_bit(PG_BUSY,&(req)->wb_flags))
@@ -79,7 +81,7 @@ extern	int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
 			  pgoff_t idx_start, unsigned int npages, int tag);
 			  pgoff_t idx_start, unsigned int npages, int tag);
 extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 			     struct inode *inode,
 			     struct inode *inode,
-			     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+			     int (*doio)(struct nfs_pageio_descriptor *desc),
 			     size_t bsize,
 			     size_t bsize,
 			     int how);
 			     int how);
 extern	int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
 extern	int nfs_pageio_add_request(struct nfs_pageio_descriptor *,

+ 14 - 2
include/linux/nfs_xdr.h

@@ -1016,9 +1016,12 @@ struct nfs_read_data {
 	unsigned int		npages;	/* Max length of pagevec */
 	unsigned int		npages;	/* Max length of pagevec */
 	struct nfs_readargs args;
 	struct nfs_readargs args;
 	struct nfs_readres  res;
 	struct nfs_readres  res;
-#ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 	unsigned long		timestamp;	/* For lease renewal */
-#endif
+	struct pnfs_layout_segment *lseg;
+	struct nfs_client	*ds_clp;	/* pNFS data server */
+	const struct rpc_call_ops *mds_ops;
+	int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
+	__u64			mds_offset;
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 };
 
 
@@ -1035,13 +1038,20 @@ struct nfs_write_data {
 	unsigned int		npages;		/* Max length of pagevec */
 	unsigned int		npages;		/* Max length of pagevec */
 	struct nfs_writeargs	args;		/* argument struct */
 	struct nfs_writeargs	args;		/* argument struct */
 	struct nfs_writeres	res;		/* result struct */
 	struct nfs_writeres	res;		/* result struct */
+	struct pnfs_layout_segment *lseg;
+	struct nfs_client	*ds_clp;	/* pNFS data server */
+	const struct rpc_call_ops *mds_ops;
+	int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
 #ifdef CONFIG_NFS_V4
 #ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
 #endif
+	__u64			mds_offset;	/* Filelayout dense stripe */
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 };
 
 
 struct nfs_access_entry;
 struct nfs_access_entry;
+struct nfs_client;
+struct rpc_timeout;
 
 
 /*
 /*
  * RPC procedure vector for NFSv2/NFSv3 demuxing
  * RPC procedure vector for NFSv2/NFSv3 demuxing
@@ -1106,6 +1116,8 @@ struct nfs_rpc_ops {
 				struct nfs_open_context *ctx,
 				struct nfs_open_context *ctx,
 				int open_flags,
 				int open_flags,
 				struct iattr *iattr);
 				struct iattr *iattr);
+	int	(*init_client) (struct nfs_client *, const struct rpc_timeout *,
+				const char *, rpc_authflavor_t, int);
 };
 };
 
 
 /*
 /*

+ 1 - 0
include/linux/sunrpc/clnt.h

@@ -129,6 +129,7 @@ struct rpc_create_args {
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
 				struct rpc_program *, u32);
 				struct rpc_program *, u32);
+void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);

+ 1 - 2
include/linux/sunrpc/xprt.h

@@ -12,7 +12,6 @@
 #include <linux/uio.h>
 #include <linux/uio.h>
 #include <linux/socket.h>
 #include <linux/socket.h>
 #include <linux/in.h>
 #include <linux/in.h>
-#include <linux/kref.h>
 #include <linux/ktime.h>
 #include <linux/ktime.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/xdr.h>
@@ -146,7 +145,7 @@ enum xprt_transports {
 };
 };
 
 
 struct rpc_xprt {
 struct rpc_xprt {
-	struct kref		kref;		/* Reference count */
+	atomic_t		count;		/* Reference count */
 	struct rpc_xprt_ops *	ops;		/* transport methods */
 	struct rpc_xprt_ops *	ops;		/* transport methods */
 
 
 	const struct rpc_timeout *timeout;	/* timeout parms */
 	const struct rpc_timeout *timeout;	/* timeout parms */

+ 1 - 1
net/sunrpc/auth_gss/auth_gss.c

@@ -417,7 +417,7 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
 		gss_msg->msg.len += len;
 		gss_msg->msg.len += len;
 	}
 	}
 	if (mech->gm_upcall_enctypes) {
 	if (mech->gm_upcall_enctypes) {
-		len = sprintf(p, mech->gm_upcall_enctypes);
+		len = sprintf(p, "enctypes=%s ", mech->gm_upcall_enctypes);
 		p += len;
 		p += len;
 		gss_msg->msg.len += len;
 		gss_msg->msg.len += len;
 	}
 	}

+ 1 - 1
net/sunrpc/auth_gss/gss_krb5_mech.c

@@ -750,7 +750,7 @@ static struct gss_api_mech gss_kerberos_mech = {
 	.gm_ops		= &gss_kerberos_ops,
 	.gm_ops		= &gss_kerberos_ops,
 	.gm_pf_num	= ARRAY_SIZE(gss_kerberos_pfs),
 	.gm_pf_num	= ARRAY_SIZE(gss_kerberos_pfs),
 	.gm_pfs		= gss_kerberos_pfs,
 	.gm_pfs		= gss_kerberos_pfs,
-	.gm_upcall_enctypes = "enctypes=18,17,16,23,3,1,2 ",
+	.gm_upcall_enctypes = "18,17,16,23,3,1,2",
 };
 };
 
 
 static int __init init_kerberos_module(void)
 static int __init init_kerberos_module(void)

+ 11 - 7
net/sunrpc/clnt.c

@@ -436,7 +436,9 @@ void rpc_killall_tasks(struct rpc_clnt *clnt)
 		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
 		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
 			rovr->tk_flags |= RPC_TASK_KILLED;
 			rovr->tk_flags |= RPC_TASK_KILLED;
 			rpc_exit(rovr, -EIO);
 			rpc_exit(rovr, -EIO);
-			rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr);
+			if (RPC_IS_QUEUED(rovr))
+				rpc_wake_up_queued_task(rovr->tk_waitqueue,
+							rovr);
 		}
 		}
 	}
 	}
 	spin_unlock(&clnt->cl_lock);
 	spin_unlock(&clnt->cl_lock);
@@ -597,6 +599,14 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
 	}
 	}
 }
 }
 
 
+void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt)
+{
+	rpc_task_release_client(task);
+	rpc_task_set_client(task, clnt);
+}
+EXPORT_SYMBOL_GPL(rpc_task_reset_client);
+
+
 static void
 static void
 rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
 rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
 {
 {
@@ -636,12 +646,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
 	rpc_task_set_client(task, task_setup_data->rpc_client);
 	rpc_task_set_client(task, task_setup_data->rpc_client);
 	rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
 	rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
 
 
-	if (task->tk_status != 0) {
-		int ret = task->tk_status;
-		rpc_put_task(task);
-		return ERR_PTR(ret);
-	}
-
 	if (task->tk_action == NULL)
 	if (task->tk_action == NULL)
 		rpc_call_start(task);
 		rpc_call_start(task);
 
 

+ 7 - 22
net/sunrpc/sched.c

@@ -299,15 +299,8 @@ static void rpc_make_runnable(struct rpc_task *task)
 	if (rpc_test_and_set_running(task))
 	if (rpc_test_and_set_running(task))
 		return;
 		return;
 	if (RPC_IS_ASYNC(task)) {
 	if (RPC_IS_ASYNC(task)) {
-		int status;
-
 		INIT_WORK(&task->u.tk_work, rpc_async_schedule);
 		INIT_WORK(&task->u.tk_work, rpc_async_schedule);
-		status = queue_work(rpciod_workqueue, &task->u.tk_work);
-		if (status < 0) {
-			printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
-			task->tk_status = status;
-			return;
-		}
+		queue_work(rpciod_workqueue, &task->u.tk_work);
 	} else
 	} else
 		wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
 		wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
 }
 }
@@ -637,14 +630,12 @@ static void __rpc_execute(struct rpc_task *task)
 			save_callback = task->tk_callback;
 			save_callback = task->tk_callback;
 			task->tk_callback = NULL;
 			task->tk_callback = NULL;
 			save_callback(task);
 			save_callback(task);
-		}
-
-		/*
-		 * Perform the next FSM step.
-		 * tk_action may be NULL when the task has been killed
-		 * by someone else.
-		 */
-		if (!RPC_IS_QUEUED(task)) {
+		} else {
+			/*
+			 * Perform the next FSM step.
+			 * tk_action may be NULL when the task has been killed
+			 * by someone else.
+			 */
 			if (task->tk_action == NULL)
 			if (task->tk_action == NULL)
 				break;
 				break;
 			task->tk_action(task);
 			task->tk_action(task);
@@ -843,12 +834,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
 	}
 	}
 
 
 	rpc_init_task(task, setup_data);
 	rpc_init_task(task, setup_data);
-	if (task->tk_status < 0) {
-		int err = task->tk_status;
-		rpc_put_task(task);
-		return ERR_PTR(err);
-	}
-
 	task->tk_flags |= flags;
 	task->tk_flags |= flags;
 	dprintk("RPC:       allocated task %p\n", task);
 	dprintk("RPC:       allocated task %p\n", task);
 	return task;
 	return task;

+ 12 - 13
net/sunrpc/xprt.c

@@ -202,10 +202,9 @@ int xprt_reserve_xprt(struct rpc_task *task)
 		goto out_sleep;
 		goto out_sleep;
 	}
 	}
 	xprt->snd_task = task;
 	xprt->snd_task = task;
-	if (req) {
-		req->rq_bytes_sent = 0;
-		req->rq_ntrans++;
-	}
+	req->rq_bytes_sent = 0;
+	req->rq_ntrans++;
+
 	return 1;
 	return 1;
 
 
 out_sleep:
 out_sleep:
@@ -213,7 +212,7 @@ out_sleep:
 			task->tk_pid, xprt);
 			task->tk_pid, xprt);
 	task->tk_timeout = 0;
 	task->tk_timeout = 0;
 	task->tk_status = -EAGAIN;
 	task->tk_status = -EAGAIN;
-	if (req && req->rq_ntrans)
+	if (req->rq_ntrans)
 		rpc_sleep_on(&xprt->resend, task, NULL);
 		rpc_sleep_on(&xprt->resend, task, NULL);
 	else
 	else
 		rpc_sleep_on(&xprt->sending, task, NULL);
 		rpc_sleep_on(&xprt->sending, task, NULL);
@@ -965,7 +964,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req)
 	xprt = kzalloc(size, GFP_KERNEL);
 	xprt = kzalloc(size, GFP_KERNEL);
 	if (xprt == NULL)
 	if (xprt == NULL)
 		goto out;
 		goto out;
-	kref_init(&xprt->kref);
+	atomic_set(&xprt->count, 1);
 
 
 	xprt->max_reqs = max_req;
 	xprt->max_reqs = max_req;
 	xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL);
 	xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL);
@@ -1145,13 +1144,11 @@ found:
 
 
 /**
 /**
  * xprt_destroy - destroy an RPC transport, killing off all requests.
  * xprt_destroy - destroy an RPC transport, killing off all requests.
- * @kref: kref for the transport to destroy
+ * @xprt: transport to destroy
  *
  *
  */
  */
-static void xprt_destroy(struct kref *kref)
+static void xprt_destroy(struct rpc_xprt *xprt)
 {
 {
-	struct rpc_xprt *xprt = container_of(kref, struct rpc_xprt, kref);
-
 	dprintk("RPC:       destroying transport %p\n", xprt);
 	dprintk("RPC:       destroying transport %p\n", xprt);
 	xprt->shutdown = 1;
 	xprt->shutdown = 1;
 	del_timer_sync(&xprt->timer);
 	del_timer_sync(&xprt->timer);
@@ -1175,7 +1172,8 @@ static void xprt_destroy(struct kref *kref)
  */
  */
 void xprt_put(struct rpc_xprt *xprt)
 void xprt_put(struct rpc_xprt *xprt)
 {
 {
-	kref_put(&xprt->kref, xprt_destroy);
+	if (atomic_dec_and_test(&xprt->count))
+		xprt_destroy(xprt);
 }
 }
 
 
 /**
 /**
@@ -1185,6 +1183,7 @@ void xprt_put(struct rpc_xprt *xprt)
  */
  */
 struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
 struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
 {
 {
-	kref_get(&xprt->kref);
-	return xprt;
+	if (atomic_inc_not_zero(&xprt->count))
+		return xprt;
+	return NULL;
 }
 }

+ 42 - 44
net/sunrpc/xprtrdma/rpc_rdma.c

@@ -87,6 +87,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
 	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
 {
 {
 	int len, n = 0, p;
 	int len, n = 0, p;
+	int page_base;
+	struct page **ppages;
 
 
 	if (pos == 0 && xdrbuf->head[0].iov_len) {
 	if (pos == 0 && xdrbuf->head[0].iov_len) {
 		seg[n].mr_page = NULL;
 		seg[n].mr_page = NULL;
@@ -95,34 +97,32 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 		++n;
 		++n;
 	}
 	}
 
 
-	if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) {
-		if (n == nsegs)
-			return 0;
-		seg[n].mr_page = xdrbuf->pages[0];
-		seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base;
-		seg[n].mr_len = min_t(u32,
-			PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len);
-		len = xdrbuf->page_len - seg[n].mr_len;
+	len = xdrbuf->page_len;
+	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
+	page_base = xdrbuf->page_base & ~PAGE_MASK;
+	p = 0;
+	while (len && n < nsegs) {
+		seg[n].mr_page = ppages[p];
+		seg[n].mr_offset = (void *)(unsigned long) page_base;
+		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+		BUG_ON(seg[n].mr_len > PAGE_SIZE);
+		len -= seg[n].mr_len;
 		++n;
 		++n;
-		p = 1;
-		while (len > 0) {
-			if (n == nsegs)
-				return 0;
-			seg[n].mr_page = xdrbuf->pages[p];
-			seg[n].mr_offset = NULL;
-			seg[n].mr_len = min_t(u32, PAGE_SIZE, len);
-			len -= seg[n].mr_len;
-			++n;
-			++p;
-		}
+		++p;
+		page_base = 0;	/* page offset only applies to first page */
 	}
 	}
 
 
+	/* Message overflows the seg array */
+	if (len && n == nsegs)
+		return 0;
+
 	if (xdrbuf->tail[0].iov_len) {
 	if (xdrbuf->tail[0].iov_len) {
 		/* the rpcrdma protocol allows us to omit any trailing
 		/* the rpcrdma protocol allows us to omit any trailing
 		 * xdr pad bytes, saving the server an RDMA operation. */
 		 * xdr pad bytes, saving the server an RDMA operation. */
 		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
 		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
 			return n;
 			return n;
 		if (n == nsegs)
 		if (n == nsegs)
+			/* Tail remains, but we're out of segments */
 			return 0;
 			return 0;
 		seg[n].mr_page = NULL;
 		seg[n].mr_page = NULL;
 		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
 		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
@@ -296,6 +296,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 	int copy_len;
 	int copy_len;
 	unsigned char *srcp, *destp;
 	unsigned char *srcp, *destp;
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	int page_base;
+	struct page **ppages;
 
 
 	destp = rqst->rq_svec[0].iov_base;
 	destp = rqst->rq_svec[0].iov_base;
 	curlen = rqst->rq_svec[0].iov_len;
 	curlen = rqst->rq_svec[0].iov_len;
@@ -324,28 +326,25 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 			__func__, destp + copy_len, curlen);
 			__func__, destp + copy_len, curlen);
 		rqst->rq_svec[0].iov_len += curlen;
 		rqst->rq_svec[0].iov_len += curlen;
 	}
 	}
-
 	r_xprt->rx_stats.pullup_copy_count += copy_len;
 	r_xprt->rx_stats.pullup_copy_count += copy_len;
-	npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT;
+
+	page_base = rqst->rq_snd_buf.page_base;
+	ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
+	page_base &= ~PAGE_MASK;
+	npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
 	for (i = 0; copy_len && i < npages; i++) {
 	for (i = 0; copy_len && i < npages; i++) {
-		if (i == 0)
-			curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base;
-		else
-			curlen = PAGE_SIZE;
+		curlen = PAGE_SIZE - page_base;
 		if (curlen > copy_len)
 		if (curlen > copy_len)
 			curlen = copy_len;
 			curlen = copy_len;
 		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
 		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
 			__func__, i, destp, copy_len, curlen);
 			__func__, i, destp, copy_len, curlen);
-		srcp = kmap_atomic(rqst->rq_snd_buf.pages[i],
-					KM_SKB_SUNRPC_DATA);
-		if (i == 0)
-			memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen);
-		else
-			memcpy(destp, srcp, curlen);
+		srcp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA);
+		memcpy(destp, srcp+page_base, curlen);
 		kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA);
 		kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA);
 		rqst->rq_svec[0].iov_len += curlen;
 		rqst->rq_svec[0].iov_len += curlen;
 		destp += curlen;
 		destp += curlen;
 		copy_len -= curlen;
 		copy_len -= curlen;
+		page_base = 0;
 	}
 	}
 	/* header now contains entire send message */
 	/* header now contains entire send message */
 	return pad;
 	return pad;
@@ -606,6 +605,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 {
 {
 	int i, npages, curlen, olen;
 	int i, npages, curlen, olen;
 	char *destp;
 	char *destp;
+	struct page **ppages;
+	int page_base;
 
 
 	curlen = rqst->rq_rcv_buf.head[0].iov_len;
 	curlen = rqst->rq_rcv_buf.head[0].iov_len;
 	if (curlen > copy_len) {	/* write chunk header fixup */
 	if (curlen > copy_len) {	/* write chunk header fixup */
@@ -624,32 +625,29 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 	olen = copy_len;
 	olen = copy_len;
 	i = 0;
 	i = 0;
 	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
 	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
+	page_base = rqst->rq_rcv_buf.page_base;
+	ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
+	page_base &= ~PAGE_MASK;
+
 	if (copy_len && rqst->rq_rcv_buf.page_len) {
 	if (copy_len && rqst->rq_rcv_buf.page_len) {
-		npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base +
+		npages = PAGE_ALIGN(page_base +
 			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
 			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
 		for (; i < npages; i++) {
 		for (; i < npages; i++) {
-			if (i == 0)
-				curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base;
-			else
-				curlen = PAGE_SIZE;
+			curlen = PAGE_SIZE - page_base;
 			if (curlen > copy_len)
 			if (curlen > copy_len)
 				curlen = copy_len;
 				curlen = copy_len;
 			dprintk("RPC:       %s: page %d"
 			dprintk("RPC:       %s: page %d"
 				" srcp 0x%p len %d curlen %d\n",
 				" srcp 0x%p len %d curlen %d\n",
 				__func__, i, srcp, copy_len, curlen);
 				__func__, i, srcp, copy_len, curlen);
-			destp = kmap_atomic(rqst->rq_rcv_buf.pages[i],
-						KM_SKB_SUNRPC_DATA);
-			if (i == 0)
-				memcpy(destp + rqst->rq_rcv_buf.page_base,
-						srcp, curlen);
-			else
-				memcpy(destp, srcp, curlen);
-			flush_dcache_page(rqst->rq_rcv_buf.pages[i]);
+			destp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA);
+			memcpy(destp + page_base, srcp, curlen);
+			flush_dcache_page(ppages[i]);
 			kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
 			kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
 			srcp += curlen;
 			srcp += curlen;
 			copy_len -= curlen;
 			copy_len -= curlen;
 			if (copy_len == 0)
 			if (copy_len == 0)
 				break;
 				break;
+			page_base = 0;
 		}
 		}
 		rqst->rq_rcv_buf.page_len = olen - copy_len;
 		rqst->rq_rcv_buf.page_len = olen - copy_len;
 	} else
 	} else

+ 45 - 8
net/sunrpc/xprtrdma/verbs.c

@@ -144,6 +144,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
 static inline
 static inline
 void rpcrdma_event_process(struct ib_wc *wc)
 void rpcrdma_event_process(struct ib_wc *wc)
 {
 {
+	struct rpcrdma_mw *frmr;
 	struct rpcrdma_rep *rep =
 	struct rpcrdma_rep *rep =
 			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;
 			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;
 
 
@@ -154,15 +155,23 @@ void rpcrdma_event_process(struct ib_wc *wc)
 		return;
 		return;
 
 
 	if (IB_WC_SUCCESS != wc->status) {
 	if (IB_WC_SUCCESS != wc->status) {
-		dprintk("RPC:       %s: %s WC status %X, connection lost\n",
-			__func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
-			 wc->status);
+		dprintk("RPC:       %s: WC opcode %d status %X, connection lost\n",
+			__func__, wc->opcode, wc->status);
 		rep->rr_len = ~0U;
 		rep->rr_len = ~0U;
-		rpcrdma_schedule_tasklet(rep);
+		if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
+			rpcrdma_schedule_tasklet(rep);
 		return;
 		return;
 	}
 	}
 
 
 	switch (wc->opcode) {
 	switch (wc->opcode) {
+	case IB_WC_FAST_REG_MR:
+		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+		frmr->r.frmr.state = FRMR_IS_VALID;
+		break;
+	case IB_WC_LOCAL_INV:
+		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+		frmr->r.frmr.state = FRMR_IS_INVALID;
+		break;
 	case IB_WC_RECV:
 	case IB_WC_RECV:
 		rep->rr_len = wc->byte_len;
 		rep->rr_len = wc->byte_len;
 		ib_dma_sync_single_for_cpu(
 		ib_dma_sync_single_for_cpu(
@@ -1450,6 +1459,12 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
 		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
 		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
 				seg->mr_offset,
 				seg->mr_offset,
 				seg->mr_dmalen, seg->mr_dir);
 				seg->mr_dmalen, seg->mr_dir);
+	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
+		dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
+			__func__,
+			(unsigned long long)seg->mr_dma,
+			seg->mr_offset, seg->mr_dmalen);
+	}
 }
 }
 
 
 static void
 static void
@@ -1469,7 +1484,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 			struct rpcrdma_xprt *r_xprt)
 			struct rpcrdma_xprt *r_xprt)
 {
 {
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_mr_seg *seg1 = seg;
-	struct ib_send_wr frmr_wr, *bad_wr;
+	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
+
 	u8 key;
 	u8 key;
 	int len, pageoff;
 	int len, pageoff;
 	int i, rc;
 	int i, rc;
@@ -1484,6 +1500,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 		rpcrdma_map_one(ia, seg, writing);
 		rpcrdma_map_one(ia, seg, writing);
 		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
 		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
 		len += seg->mr_len;
 		len += seg->mr_len;
+		BUG_ON(seg->mr_len > PAGE_SIZE);
 		++seg;
 		++seg;
 		++i;
 		++i;
 		/* Check for holes */
 		/* Check for holes */
@@ -1494,26 +1511,45 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
 	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
 		__func__, seg1->mr_chunk.rl_mw, i);
 		__func__, seg1->mr_chunk.rl_mw, i);
 
 
+	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
+		dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
+			__func__,
+			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
+		/* Invalidate before using. */
+		memset(&invalidate_wr, 0, sizeof invalidate_wr);
+		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+		invalidate_wr.next = &frmr_wr;
+		invalidate_wr.opcode = IB_WR_LOCAL_INV;
+		invalidate_wr.send_flags = IB_SEND_SIGNALED;
+		invalidate_wr.ex.invalidate_rkey =
+			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+		DECR_CQCOUNT(&r_xprt->rx_ep);
+		post_wr = &invalidate_wr;
+	} else
+		post_wr = &frmr_wr;
+
 	/* Bump the key */
 	/* Bump the key */
 	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
 	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
 	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
 	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
 
 
 	/* Prepare FRMR WR */
 	/* Prepare FRMR WR */
 	memset(&frmr_wr, 0, sizeof frmr_wr);
 	memset(&frmr_wr, 0, sizeof frmr_wr);
+	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
 	frmr_wr.opcode = IB_WR_FAST_REG_MR;
 	frmr_wr.opcode = IB_WR_FAST_REG_MR;
-	frmr_wr.send_flags = 0;			/* unsignaled */
+	frmr_wr.send_flags = IB_SEND_SIGNALED;
 	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
 	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
 	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
 	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
 	frmr_wr.wr.fast_reg.page_list_len = i;
 	frmr_wr.wr.fast_reg.page_list_len = i;
 	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
 	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
+	BUG_ON(frmr_wr.wr.fast_reg.length < len);
 	frmr_wr.wr.fast_reg.access_flags = (writing ?
 	frmr_wr.wr.fast_reg.access_flags = (writing ?
 				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
 				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
 				IB_ACCESS_REMOTE_READ);
 				IB_ACCESS_REMOTE_READ);
 	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
 	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
 
-	rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
+	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
 
 
 	if (rc) {
 	if (rc) {
 		dprintk("RPC:       %s: failed ib_post_send for register,"
 		dprintk("RPC:       %s: failed ib_post_send for register,"
@@ -1542,8 +1578,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
 		rpcrdma_unmap_one(ia, seg++);
 		rpcrdma_unmap_one(ia, seg++);
 
 
 	memset(&invalidate_wr, 0, sizeof invalidate_wr);
 	memset(&invalidate_wr, 0, sizeof invalidate_wr);
+	invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
 	invalidate_wr.opcode = IB_WR_LOCAL_INV;
 	invalidate_wr.opcode = IB_WR_LOCAL_INV;
-	invalidate_wr.send_flags = 0;			/* unsignaled */
+	invalidate_wr.send_flags = IB_SEND_SIGNALED;
 	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
 	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
 

+ 1 - 0
net/sunrpc/xprtrdma/xprt_rdma.h

@@ -164,6 +164,7 @@ struct rpcrdma_mr_seg {		/* chunk descriptors */
 				struct {
 				struct {
 					struct ib_fast_reg_page_list *fr_pgl;
 					struct ib_fast_reg_page_list *fr_pgl;
 					struct ib_mr *fr_mr;
 					struct ib_mr *fr_mr;
+					enum { FRMR_IS_INVALID, FRMR_IS_VALID  } state;
 				} frmr;
 				} frmr;
 			} r;
 			} r;
 			struct list_head mw_list;
 			struct list_head mw_list;