Browse Source

Merge tag 'nfs-for-4.3-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs

Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable patches:
   - Fix atomicity of pNFS commit list updates
   - Fix NFSv4 handling of open(O_CREAT|O_EXCL|O_RDONLY)
   - nfs_set_pgio_error sometimes misses errors
   - Fix a thinko in xs_connect()
   - Fix borkage in _same_data_server_addrs_locked()
   - Fix a NULL pointer dereference of migration recovery ops for v4.2
     client
   - Don't let the ctime override attribute barriers.
   - Revert "NFSv4: Remove incorrect check in can_open_delegated()"
   - Ensure flexfiles pNFS driver updates the inode after write finishes
   - flexfiles must not pollute the attribute cache with attrbutes from
     the DS
   - Fix a protocol error in layoutreturn
   - Fix a protocol issue with NFSv4.1 CLOSE stateids

  Bugfixes + cleanups
   - pNFS blocks bugfixes from Christoph
   - Various cleanups from Anna
   - More fixes for delegation corner cases
   - Don't fsync twice for O_SYNC/IS_SYNC files
   - Fix pNFS and flexfiles layoutstats bugs
   - pnfs/flexfiles: avoid duplicate tracking of mirror data
   - pnfs: Fix layoutget/layoutreturn/return-on-close serialisation
     issues
   - pnfs/flexfiles: error handling retries a layoutget before fallback
     to MDS

  Features:
   - Full support for the OPEN NFS4_CREATE_EXCLUSIVE4_1 mode from
     Kinglong
   - More RDMA client transport improvements from Chuck
   - Removal of the deprecated ib_reg_phys_mr() and ib_rereg_phys_mr()
     verbs from the SUNRPC, Lustre and core infiniband tree.
   - Optimise away the close-to-open getattr if there is no cached data"

* tag 'nfs-for-4.3-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (108 commits)
  NFSv4: Respect the server imposed limit on how many changes we may cache
  NFSv4: Express delegation limit in units of pages
  Revert "NFS: Make close(2) asynchronous when closing NFS O_DIRECT files"
  NFS: Optimise away the close-to-open getattr if there is no cached data
  NFSv4.1/flexfiles: Clean up ff_layout_write_done_cb/ff_layout_commit_done_cb
  NFSv4.1/flexfiles: Mark the layout for return in ff_layout_io_track_ds_error()
  nfs: Remove unneeded checking of the return value from scnprintf
  nfs: Fix truncated client owner id without proto type
  NFSv4.1/flexfiles: Mark layout for return if the mirrors are invalid
  NFSv4.1/flexfiles: RW layouts are valid only if all mirrors are valid
  NFSv4.1/flexfiles: Fix incorrect usage of pnfs_generic_mark_devid_invalid()
  NFSv4.1/flexfiles: Fix freeing of mirrors
  NFSv4.1/pNFS: Don't request a minimal read layout beyond the end of file
  NFSv4.1/pnfs: Handle LAYOUTGET return values correctly
  NFSv4.1/pnfs: Don't ask for a read layout for an empty file.
  NFSv4.1: Fix a protocol issue with CLOSE stateids
  NFSv4.1/flexfiles: Don't mark the entire deviceid as bad for file errors
  SUNRPC: Prevent SYN+SYNACK+RST storms
  SUNRPC: xs_reset_transport must mark the connection as disconnected
  NFSv4.1/pnfs: Ensure layoutreturn reserves space for the opaque payload
  ...
Linus Torvalds 10 years ago
parent
commit
4e4adb2f46
54 changed files with 1321 additions and 1089 deletions
  1. 9 0
      Documentation/kernel-parameters.txt
  2. 0 67
      drivers/infiniband/core/verbs.c
  3. 1 18
      fs/nfs/blocklayout/blocklayout.h
  4. 7 2
      fs/nfs/blocklayout/dev.c
  5. 11 8
      fs/nfs/blocklayout/extent_tree.c
  6. 0 4
      fs/nfs/callback.c
  7. 7 2
      fs/nfs/callback_proc.c
  8. 2 111
      fs/nfs/client.c
  9. 27 2
      fs/nfs/delegation.c
  10. 2 1
      fs/nfs/delegation.h
  11. 6 14
      fs/nfs/dir.c
  12. 6 15
      fs/nfs/file.c
  13. 291 133
      fs/nfs/flexfilelayout/flexfilelayout.c
  14. 4 1
      fs/nfs/flexfilelayout/flexfilelayout.h
  15. 63 19
      fs/nfs/flexfilelayout/flexfilelayoutdev.c
  16. 34 27
      fs/nfs/inode.c
  17. 10 10
      fs/nfs/internal.h
  18. 1 0
      fs/nfs/nfs3xdr.c
  19. 0 2
      fs/nfs/nfs42.h
  20. 2 3
      fs/nfs/nfs42xdr.c
  21. 1 3
      fs/nfs/nfs4_fs.h
  22. 1 4
      fs/nfs/nfs4client.c
  23. 29 3
      fs/nfs/nfs4file.c
  24. 2 12
      fs/nfs/nfs4idmap.c
  25. 84 52
      fs/nfs/nfs4proc.c
  26. 1 11
      fs/nfs/nfs4state.c
  27. 61 0
      fs/nfs/nfs4trace.h
  28. 54 21
      fs/nfs/nfs4xdr.c
  29. 2 2
      fs/nfs/pagelist.c
  30. 135 92
      fs/nfs/pnfs.c
  31. 38 10
      fs/nfs/pnfs.h
  32. 54 34
      fs/nfs/pnfs_nfs.c
  33. 5 2
      fs/nfs/super.c
  34. 25 11
      fs/nfs/write.c
  35. 1 1
      fs/nfsd/blocklayoutxdr.c
  36. 0 15
      fs/nfsd/blocklayoutxdr.h
  37. 18 0
      include/linux/nfs4.h
  38. 1 1
      include/linux/nfs_fs.h
  39. 5 0
      include/linux/nfs_fs_sb.h
  40. 5 3
      include/linux/nfs_xdr.h
  41. 20 7
      include/linux/sunrpc/addr.h
  42. 6 2
      include/linux/sunrpc/auth.h
  43. 1 1
      include/linux/sunrpc/xprtrdma.h
  44. 0 46
      include/rdma/ib_verbs.h
  45. 1 1
      include/uapi/linux/nfs4.h
  46. 1 1
      net/sunrpc/auth_unix.c
  47. 19 0
      net/sunrpc/xprtrdma/fmr_ops.c
  48. 5 0
      net/sunrpc/xprtrdma/frwr_ops.c
  49. 24 1
      net/sunrpc/xprtrdma/physical_ops.c
  50. 101 96
      net/sunrpc/xprtrdma/rpc_rdma.c
  51. 35 42
      net/sunrpc/xprtrdma/transport.c
  52. 79 155
      net/sunrpc/xprtrdma/verbs.c
  53. 13 14
      net/sunrpc/xprtrdma/xprt_rdma.h
  54. 11 7
      net/sunrpc/xprtsock.c

+ 9 - 0
Documentation/kernel-parameters.txt

@@ -2285,6 +2285,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			The default parameter value of '0' causes the kernel
 			The default parameter value of '0' causes the kernel
 			not to attempt recovery of lost locks.
 			not to attempt recovery of lost locks.
 
 
+	nfs4.layoutstats_timer =
+			[NFSv4.2] Change the rate at which the kernel sends
+			layoutstats to the pNFS metadata server.
+
+			Setting this to value to 0 causes the kernel to use
+			whatever value is the default set by the layout
+			driver. A non-zero value sets the minimum interval
+			in seconds between layoutstats transmissions.
+
 	nfsd.nfs4_disable_idmapping=
 	nfsd.nfs4_disable_idmapping=
 			[NFSv4] When set to the default of '1', the NFSv4
 			[NFSv4] When set to the default of '1', the NFSv4
 			server will return only numeric uids and gids to
 			server will return only numeric uids and gids to

+ 0 - 67
drivers/infiniband/core/verbs.c

@@ -1144,73 +1144,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
 }
 }
 EXPORT_SYMBOL(ib_get_dma_mr);
 EXPORT_SYMBOL(ib_get_dma_mr);
 
 
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
-			     struct ib_phys_buf *phys_buf_array,
-			     int num_phys_buf,
-			     int mr_access_flags,
-			     u64 *iova_start)
-{
-	struct ib_mr *mr;
-	int err;
-
-	err = ib_check_mr_access(mr_access_flags);
-	if (err)
-		return ERR_PTR(err);
-
-	if (!pd->device->reg_phys_mr)
-		return ERR_PTR(-ENOSYS);
-
-	mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
-				     mr_access_flags, iova_start);
-
-	if (!IS_ERR(mr)) {
-		mr->device  = pd->device;
-		mr->pd      = pd;
-		mr->uobject = NULL;
-		atomic_inc(&pd->usecnt);
-		atomic_set(&mr->usecnt, 0);
-	}
-
-	return mr;
-}
-EXPORT_SYMBOL(ib_reg_phys_mr);
-
-int ib_rereg_phys_mr(struct ib_mr *mr,
-		     int mr_rereg_mask,
-		     struct ib_pd *pd,
-		     struct ib_phys_buf *phys_buf_array,
-		     int num_phys_buf,
-		     int mr_access_flags,
-		     u64 *iova_start)
-{
-	struct ib_pd *old_pd;
-	int ret;
-
-	ret = ib_check_mr_access(mr_access_flags);
-	if (ret)
-		return ret;
-
-	if (!mr->device->rereg_phys_mr)
-		return -ENOSYS;
-
-	if (atomic_read(&mr->usecnt))
-		return -EBUSY;
-
-	old_pd = mr->pd;
-
-	ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
-					phys_buf_array, num_phys_buf,
-					mr_access_flags, iova_start);
-
-	if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
-		atomic_dec(&old_pd->usecnt);
-		atomic_inc(&pd->usecnt);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL(ib_rereg_phys_mr);
-
 int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
 int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
 {
 {
 	return mr->device->query_mr ?
 	return mr->device->query_mr ?

+ 1 - 18
fs/nfs/blocklayout/blocklayout.h

@@ -46,13 +46,6 @@
 
 
 struct pnfs_block_dev;
 struct pnfs_block_dev;
 
 
-enum pnfs_block_volume_type {
-	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
-	PNFS_BLOCK_VOLUME_SLICE		= 1,
-	PNFS_BLOCK_VOLUME_CONCAT	= 2,
-	PNFS_BLOCK_VOLUME_STRIPE	= 3,
-};
-
 #define PNFS_BLOCK_MAX_UUIDS	4
 #define PNFS_BLOCK_MAX_UUIDS	4
 #define PNFS_BLOCK_MAX_DEVICES	64
 #define PNFS_BLOCK_MAX_DEVICES	64
 
 
@@ -117,13 +110,6 @@ struct pnfs_block_dev {
 			struct pnfs_block_dev_map *map);
 			struct pnfs_block_dev_map *map);
 };
 };
 
 
-enum exstate4 {
-	PNFS_BLOCK_READWRITE_DATA	= 0,
-	PNFS_BLOCK_READ_DATA		= 1,
-	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
-	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
-};
-
 /* sector_t fields are all in 512-byte sectors */
 /* sector_t fields are all in 512-byte sectors */
 struct pnfs_block_extent {
 struct pnfs_block_extent {
 	union {
 	union {
@@ -134,15 +120,12 @@ struct pnfs_block_extent {
 	sector_t	be_f_offset;	/* the starting offset in the file */
 	sector_t	be_f_offset;	/* the starting offset in the file */
 	sector_t	be_length;	/* the size of the extent */
 	sector_t	be_length;	/* the size of the extent */
 	sector_t	be_v_offset;	/* the starting offset in the volume */
 	sector_t	be_v_offset;	/* the starting offset in the volume */
-	enum exstate4	be_state;	/* the state of this extent */
+	enum pnfs_block_extent_state be_state;	/* the state of this extent */
 #define EXTENT_WRITTEN		1
 #define EXTENT_WRITTEN		1
 #define EXTENT_COMMITTING	2
 #define EXTENT_COMMITTING	2
 	unsigned int	be_tag;
 	unsigned int	be_tag;
 };
 };
 
 
-/* on the wire size of the extent */
-#define BL_EXTENT_SIZE	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
-
 struct pnfs_block_layout {
 struct pnfs_block_layout {
 	struct pnfs_layout_hdr	bl_layout;
 	struct pnfs_layout_hdr	bl_layout;
 	struct rb_root		bl_ext_rw;
 	struct rb_root		bl_ext_rw;

+ 7 - 2
fs/nfs/blocklayout/dev.c

@@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev)
 		kfree(dev->children);
 		kfree(dev->children);
 	} else {
 	} else {
 		if (dev->bdev)
 		if (dev->bdev)
-			blkdev_put(dev->bdev, FMODE_READ);
+			blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
 	}
 	}
 }
 }
 
 
@@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 				return -EIO;
 				return -EIO;
 			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
 			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
 			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
 			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+			if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+				pr_info("signature too long: %d\n",
+					b->simple.sigs[i].sig_len);
+				return -EIO;
+			}
 
 
 			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
 			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
 			if (!p)
 			if (!p)
@@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
 	if (!dev)
 	if (!dev)
 		return -EIO;
 		return -EIO;
 
 
-	d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
 	if (IS_ERR(d->bdev)) {
 	if (IS_ERR(d->bdev)) {
 		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
 		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
 			MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
 			MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));

+ 11 - 8
fs/nfs/blocklayout/extent_tree.c

@@ -462,6 +462,12 @@ out:
 	return err;
 	return err;
 }
 }
 
 
+static size_t ext_tree_layoutupdate_size(size_t count)
+{
+	return sizeof(__be32) /* number of entries */ +
+		PNFS_BLOCK_EXTENT_SIZE * count;
+}
+
 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 		size_t buffer_size)
 		size_t buffer_size)
 {
 {
@@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 			continue;
 			continue;
 
 
 		(*count)++;
 		(*count)++;
-		if (*count * BL_EXTENT_SIZE > buffer_size) {
+		if (ext_tree_layoutupdate_size(*count) > buffer_size) {
 			/* keep counting.. */
 			/* keep counting.. */
 			ret = -ENOSPC;
 			ret = -ENOSPC;
 			continue;
 			continue;
@@ -530,7 +536,7 @@ retry:
 	if (unlikely(ret)) {
 	if (unlikely(ret)) {
 		ext_tree_free_commitdata(arg, buffer_size);
 		ext_tree_free_commitdata(arg, buffer_size);
 
 
-		buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+		buffer_size = ext_tree_layoutupdate_size(count);
 		count = 0;
 		count = 0;
 
 
 		arg->layoutupdate_pages =
 		arg->layoutupdate_pages =
@@ -549,17 +555,14 @@ retry:
 	}
 	}
 
 
 	*start_p = cpu_to_be32(count);
 	*start_p = cpu_to_be32(count);
-	arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+	arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
 
 
 	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
-		__be32 *p = start_p;
+		void *p = start_p, *end = p + arg->layoutupdate_len;
 		int i = 0;
 		int i = 0;
 
 
-		for (p = start_p;
-		     p < start_p + arg->layoutupdate_len;
-		     p += PAGE_SIZE) {
+		for ( ; p < end; p += PAGE_SIZE)
 			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
 			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
-		}
 	}
 	}
 
 
 	dprintk("%s found %zu ranges\n", __func__, count);
 	dprintk("%s found %zu ranges\n", __func__, count);

+ 0 - 4
fs/nfs/callback.c

@@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv)
 	spin_lock_init(&serv->sv_cb_lock);
 	spin_lock_init(&serv->sv_cb_lock);
 	init_waitqueue_head(&serv->sv_cb_waitq);
 	init_waitqueue_head(&serv->sv_cb_waitq);
 	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
 	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-	if (IS_ERR(rqstp)) {
-		svc_xprt_put(serv->sv_bc_xprt);
-		serv->sv_bc_xprt = NULL;
-	}
 	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
 	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
 	return rqstp;
 	return rqstp;
 }
 }

+ 7 - 2
fs/nfs/callback_proc.c

@@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
 		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
 		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
 
 
 	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
 	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
-	if (inode == NULL)
+	if (inode == NULL) {
+		trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL,
+				-ntohl(res->status));
 		goto out;
 		goto out;
+	}
 	nfsi = NFS_I(inode);
 	nfsi = NFS_I(inode);
 	rcu_read_lock();
 	rcu_read_lock();
 	delegation = rcu_dereference(nfsi->delegation);
 	delegation = rcu_dereference(nfsi->delegation);
@@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
 	res->status = 0;
 	res->status = 0;
 out_iput:
 out_iput:
 	rcu_read_unlock();
 	rcu_read_unlock();
+	trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
 	iput(inode);
 	iput(inode);
 out:
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
@@ -194,6 +198,7 @@ unlock:
 	spin_unlock(&ino->i_lock);
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_put_layout_hdr(lo);
 	pnfs_put_layout_hdr(lo);
+	trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
 	iput(ino);
 	iput(ino);
 out:
 out:
 	return rv;
 	return rv;
@@ -554,7 +559,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
 	status = htonl(NFS4_OK);
 	status = htonl(NFS4_OK);
 
 
 	nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
 	nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
-	nfs41_server_notify_target_slotid_update(cps->clp);
+	nfs41_notify_server(cps->clp);
 out:
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	return status;
 	return status;

+ 2 - 111
fs/nfs/client.c

@@ -20,6 +20,7 @@
 #include <linux/stat.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
 #include <linux/unistd.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
 #include <linux/sunrpc/metrics.h>
@@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp)
 }
 }
 EXPORT_SYMBOL_GPL(nfs_put_client);
 EXPORT_SYMBOL_GPL(nfs_put_client);
 
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-/*
- * Test if two ip6 socket addresses refer to the same socket by
- * comparing relevant fields. The padding bytes specifically, are not
- * compared. sin6_flowinfo is not compared because it only affects QoS
- * and sin6_scope_id is only compared if the address is "link local"
- * because "link local" addresses need only be unique to a specific
- * link. Conversely, ordinary unicast addresses might have different
- * sin6_scope_id.
- *
- * The caller should ensure both socket addresses are AF_INET6.
- */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
-{
-	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
-	if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
-		return 0;
-	else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
-		return sin1->sin6_scope_id == sin2->sin6_scope_id;
-
-	return 1;
-}
-#else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
-{
-	return 0;
-}
-#endif
-
-/*
- * Test if two ip4 socket addresses refer to the same socket, by
- * comparing relevant fields. The padding bytes specifically, are
- * not compared.
- *
- * The caller should ensure both socket addresses are AF_INET.
- */
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
-{
-	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
-	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
-				const struct sockaddr *sa2)
-{
-	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
-	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
-		(sin1->sin6_port == sin2->sin6_port);
-}
-
-static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
-				const struct sockaddr *sa2)
-{
-	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
-	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
-		(sin1->sin_port == sin2->sin_port);
-}
-
-#if defined(CONFIG_NFS_V4_1)
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, excluding the port number.
- */
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-			      const struct sockaddr *sa2)
-{
-	if (sa1->sa_family != sa2->sa_family)
-		return 0;
-
-	switch (sa1->sa_family) {
-	case AF_INET:
-		return nfs_sockaddr_match_ipaddr4(sa1, sa2);
-	case AF_INET6:
-		return nfs_sockaddr_match_ipaddr6(sa1, sa2);
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr);
-#endif /* CONFIG_NFS_V4_1 */
-
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, including the port number.
- */
-static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
-			    const struct sockaddr *sa2)
-{
-	if (sa1->sa_family != sa2->sa_family)
-		return 0;
-
-	switch (sa1->sa_family) {
-	case AF_INET:
-		return nfs_sockaddr_cmp_ip4(sa1, sa2);
-	case AF_INET6:
-		return nfs_sockaddr_cmp_ip6(sa1, sa2);
-	}
-	return 0;
-}
-
 /*
 /*
  * Find an nfs_client on the list that matches the initialisation data
  * Find an nfs_client on the list that matches the initialisation data
  * that is supplied.
  * that is supplied.
@@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 		if (clp->cl_minorversion != data->minorversion)
 		if (clp->cl_minorversion != data->minorversion)
 			continue;
 			continue;
 		/* Match the full socket address */
 		/* Match the full socket address */
-		if (!nfs_sockaddr_cmp(sap, clap))
+		if (!rpc_cmp_addr_port(sap, clap))
 			continue;
 			continue;
 
 
 		atomic_inc(&clp->cl_count);
 		atomic_inc(&clp->cl_count);

+ 27 - 2
fs/nfs/delegation.c

@@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
 		if (delegation->inode != NULL) {
 		if (delegation->inode != NULL) {
 			nfs4_stateid_copy(&delegation->stateid, &res->delegation);
 			nfs4_stateid_copy(&delegation->stateid, &res->delegation);
 			delegation->type = res->delegation_type;
 			delegation->type = res->delegation_type;
-			delegation->maxsize = res->maxsize;
+			delegation->pagemod_limit = res->pagemod_limit;
 			oldcred = delegation->cred;
 			oldcred = delegation->cred;
 			delegation->cred = get_rpccred(cred);
 			delegation->cred = get_rpccred(cred);
 			clear_bit(NFS_DELEGATION_NEED_RECLAIM,
 			clear_bit(NFS_DELEGATION_NEED_RECLAIM,
@@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 		return -ENOMEM;
 		return -ENOMEM;
 	nfs4_stateid_copy(&delegation->stateid, &res->delegation);
 	nfs4_stateid_copy(&delegation->stateid, &res->delegation);
 	delegation->type = res->delegation_type;
 	delegation->type = res->delegation_type;
-	delegation->maxsize = res->maxsize;
+	delegation->pagemod_limit = res->pagemod_limit;
 	delegation->change_attr = inode->i_version;
 	delegation->change_attr = inode->i_version;
 	delegation->cred = get_rpccred(cred);
 	delegation->cred = get_rpccred(cred);
 	delegation->inode = inode;
 	delegation->inode = inode;
@@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
 	rcu_read_unlock();
 	rcu_read_unlock();
 	return ret;
 	return ret;
 }
 }
+
+/**
+ * nfs4_delegation_flush_on_close - Check if we must flush file on close
+ * @inode: inode to check
+ *
+ * This function checks the number of outstanding writes to the file
+ * against the delegation 'space_limit' field to see if
+ * the spec requires us to flush the file on close.
+ */
+bool nfs4_delegation_flush_on_close(const struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_delegation *delegation;
+	bool ret = true;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(nfsi->delegation);
+	if (delegation == NULL || !(delegation->type & FMODE_WRITE))
+		goto out;
+	if (nfsi->nrequests < delegation->pagemod_limit)
+		ret = false;
+out:
+	rcu_read_unlock();
+	return ret;
+}

+ 2 - 1
fs/nfs/delegation.h

@@ -18,7 +18,7 @@ struct nfs_delegation {
 	struct inode *inode;
 	struct inode *inode;
 	nfs4_stateid stateid;
 	nfs4_stateid stateid;
 	fmode_t type;
 	fmode_t type;
-	loff_t maxsize;
+	unsigned long pagemod_limit;
 	__u64 change_attr;
 	__u64 change_attr;
 	unsigned long flags;
 	unsigned long flags;
 	spinlock_t lock;
 	spinlock_t lock;
@@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
 int nfs4_check_delegation(struct inode *inode, fmode_t flags);
 int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+bool nfs4_delegation_flush_on_close(const struct inode *inode);
 
 
 #endif
 #endif
 
 

+ 6 - 14
fs/nfs/dir.c

@@ -583,26 +583,19 @@ out_nopages:
 }
 }
 
 
 static
 static
-void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
 {
 {
 	unsigned int i;
 	unsigned int i;
 	for (i = 0; i < npages; i++)
 	for (i = 0; i < npages; i++)
 		put_page(pages[i]);
 		put_page(pages[i]);
 }
 }
 
 
-static
-void nfs_readdir_free_large_page(void *ptr, struct page **pages,
-		unsigned int npages)
-{
-	nfs_readdir_free_pagearray(pages, npages);
-}
-
 /*
 /*
  * nfs_readdir_large_page will allocate pages that must be freed with a call
  * nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_large_page
+ * to nfs_readdir_free_pagearray
  */
  */
 static
 static
-int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
 {
 {
 	unsigned int i;
 	unsigned int i;
 
 
@@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages)
 	return 0;
 	return 0;
 
 
 out_freepages:
 out_freepages:
-	nfs_readdir_free_pagearray(pages, i);
+	nfs_readdir_free_pages(pages, i);
 	return -ENOMEM;
 	return -ENOMEM;
 }
 }
 
 
@@ -623,7 +616,6 @@ static
 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
 {
 {
 	struct page *pages[NFS_MAX_READDIR_PAGES];
 	struct page *pages[NFS_MAX_READDIR_PAGES];
-	void *pages_ptr = NULL;
 	struct nfs_entry entry;
 	struct nfs_entry entry;
 	struct file	*file = desc->file;
 	struct file	*file = desc->file;
 	struct nfs_cache_array *array;
 	struct nfs_cache_array *array;
@@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	memset(array, 0, sizeof(struct nfs_cache_array));
 	memset(array, 0, sizeof(struct nfs_cache_array));
 	array->eof_index = -1;
 	array->eof_index = -1;
 
 
-	status = nfs_readdir_large_page(pages, array_size);
+	status = nfs_readdir_alloc_pages(pages, array_size);
 	if (status < 0)
 	if (status < 0)
 		goto out_release_array;
 		goto out_release_array;
 	do {
 	do {
@@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 		}
 		}
 	} while (array->eof_index < 0);
 	} while (array->eof_index < 0);
 
 
-	nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+	nfs_readdir_free_pages(pages, array_size);
 out_release_array:
 out_release_array:
 	nfs_readdir_release_array(page);
 	nfs_readdir_release_array(page);
 out_label_free:
 out_label_free:

+ 6 - 15
fs/nfs/file.c

@@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp)
 	dprintk("NFS: release(%pD2)\n", filp);
 	dprintk("NFS: release(%pD2)\n", filp);
 
 
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
-	return nfs_release(inode, filp);
+	nfs_file_clear_open_context(filp);
+	return 0;
 }
 }
 EXPORT_SYMBOL_GPL(nfs_file_release);
 EXPORT_SYMBOL_GPL(nfs_file_release);
 
 
@@ -141,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek);
 /*
 /*
  * Flush all dirty pages, and check for write errors.
  * Flush all dirty pages, and check for write errors.
  */
  */
-int
+static int
 nfs_file_flush(struct file *file, fl_owner_t id)
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
 {
 	struct inode	*inode = file_inode(file);
 	struct inode	*inode = file_inode(file);
@@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	if ((file->f_mode & FMODE_WRITE) == 0)
 	if ((file->f_mode & FMODE_WRITE) == 0)
 		return 0;
 		return 0;
 
 
-	/*
-	 * If we're holding a write delegation, then just start the i/o
-	 * but don't wait for completion (or send a commit).
-	 */
-	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
-		return filemap_fdatawrite(file->f_mapping);
-
 	/* Flush writes to the server and return any errors */
 	/* Flush writes to the server and return any errors */
 	return vfs_fsync(file, 0);
 	return vfs_fsync(file, 0);
 }
 }
-EXPORT_SYMBOL_GPL(nfs_file_flush);
 
 
 ssize_t
 ssize_t
 nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
 nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
@@ -644,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
 	.page_mkwrite = nfs_vm_page_mkwrite,
 	.page_mkwrite = nfs_vm_page_mkwrite,
 };
 };
 
 
-static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+static int nfs_need_check_write(struct file *filp, struct inode *inode)
 {
 {
 	struct nfs_open_context *ctx;
 	struct nfs_open_context *ctx;
 
 
-	if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
-		return 1;
 	ctx = nfs_file_open_context(filp);
 	ctx = nfs_file_open_context(filp);
 	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
 	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
 	    nfs_ctx_key_to_expire(ctx))
 	    nfs_ctx_key_to_expire(ctx))
@@ -699,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 	if (result > 0)
 	if (result > 0)
 		written = result;
 		written = result;
 
 
-	/* Return error values for O_DSYNC and IS_SYNC() */
-	if (result >= 0 && nfs_need_sync_write(file, inode)) {
+	/* Return error values */
+	if (result >= 0 && nfs_need_check_write(file, inode)) {
 		int err = vfs_fsync(file, 0);
 		int err = vfs_fsync(file, 0);
 		if (err < 0)
 		if (err < 0)
 			result = err;
 			result = err;

+ 291 - 133
fs/nfs/flexfilelayout/flexfilelayout.c

@@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 	ffl = kzalloc(sizeof(*ffl), gfp_flags);
 	ffl = kzalloc(sizeof(*ffl), gfp_flags);
 	if (ffl) {
 	if (ffl) {
 		INIT_LIST_HEAD(&ffl->error_list);
 		INIT_LIST_HEAD(&ffl->error_list);
+		INIT_LIST_HEAD(&ffl->mirrors);
 		return &ffl->generic_hdr;
 		return &ffl->generic_hdr;
 	} else
 	} else
 		return NULL;
 		return NULL;
@@ -135,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id)
 	return 0;
 	return 0;
 }
 }
 
 
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+		const struct nfs4_ff_layout_mirror *m2)
+{
+	int i, j;
+
+	if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+		return false;
+	for (i = 0; i < m1->fh_versions_cnt; i++) {
+		bool found_fh = false;
+		for (j = 0; j < m2->fh_versions_cnt; i++) {
+			if (nfs_compare_fh(&m1->fh_versions[i],
+					&m2->fh_versions[j]) == 0) {
+				found_fh = true;
+				break;
+			}
+		}
+		if (!found_fh)
+			return false;
+	}
+	return true;
+}
+
+static struct nfs4_ff_layout_mirror *
+ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
+		struct nfs4_ff_layout_mirror *mirror)
+{
+	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+	struct nfs4_ff_layout_mirror *pos;
+	struct inode *inode = lo->plh_inode;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
+		if (mirror->mirror_ds != pos->mirror_ds)
+			continue;
+		if (!ff_mirror_match_fh(mirror, pos))
+			continue;
+		if (atomic_inc_not_zero(&pos->ref)) {
+			spin_unlock(&inode->i_lock);
+			return pos;
+		}
+	}
+	list_add(&mirror->mirrors, &ff_layout->mirrors);
+	mirror->layout = lo;
+	spin_unlock(&inode->i_lock);
+	return mirror;
+}
+
+static void
+ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+	struct inode *inode;
+	if (mirror->layout == NULL)
+		return;
+	inode = mirror->layout->plh_inode;
+	spin_lock(&inode->i_lock);
+	list_del(&mirror->mirrors);
+	spin_unlock(&inode->i_lock);
+	mirror->layout = NULL;
+}
+
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+
+	mirror = kzalloc(sizeof(*mirror), gfp_flags);
+	if (mirror != NULL) {
+		spin_lock_init(&mirror->lock);
+		atomic_set(&mirror->ref, 1);
+		INIT_LIST_HEAD(&mirror->mirrors);
+	}
+	return mirror;
+}
+
+static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+	ff_layout_remove_mirror(mirror);
+	kfree(mirror->fh_versions);
+	if (mirror->cred)
+		put_rpccred(mirror->cred);
+	nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+	kfree(mirror);
+}
+
+static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+	if (mirror != NULL && atomic_dec_and_test(&mirror->ref))
+		ff_layout_free_mirror(mirror);
+}
+
 static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 {
 {
 	int i;
 	int i;
@@ -144,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 			/* normally mirror_ds is freed in
 			/* normally mirror_ds is freed in
 			 * .free_deviceid_node but we still do it here
 			 * .free_deviceid_node but we still do it here
 			 * for .alloc_lseg error path */
 			 * for .alloc_lseg error path */
-			if (fls->mirror_array[i]) {
-				kfree(fls->mirror_array[i]->fh_versions);
-				nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
-				kfree(fls->mirror_array[i]);
-			}
+			ff_layout_put_mirror(fls->mirror_array[i]);
 		}
 		}
 		kfree(fls->mirror_array);
 		kfree(fls->mirror_array);
 		fls->mirror_array = NULL;
 		fls->mirror_array = NULL;
@@ -181,6 +267,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
 	}
 	}
 }
 }
 
 
+static bool
+ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
+		const struct pnfs_layout_range *l2)
+{
+	u64 end1, end2;
+
+	if (l1->iomode != l2->iomode)
+		return l1->iomode != IOMODE_READ;
+	end1 = pnfs_calc_offset_end(l1->offset, l1->length);
+	end2 = pnfs_calc_offset_end(l2->offset, l2->length);
+	if (end1 < l2->offset)
+		return false;
+	if (end2 < l1->offset)
+		return true;
+	return l2->offset <= l1->offset;
+}
+
+static bool
+ff_lseg_merge(struct pnfs_layout_segment *new,
+		struct pnfs_layout_segment *old)
+{
+	u64 new_end, old_end;
+
+	if (new->pls_range.iomode != old->pls_range.iomode)
+		return false;
+	old_end = pnfs_calc_offset_end(old->pls_range.offset,
+			old->pls_range.length);
+	if (old_end < new->pls_range.offset)
+		return false;
+	new_end = pnfs_calc_offset_end(new->pls_range.offset,
+			new->pls_range.length);
+	if (new_end < old->pls_range.offset)
+		return false;
+
+	/* Mergeable: copy info from 'old' to 'new' */
+	if (new_end < old_end)
+		new_end = old_end;
+	if (new->pls_range.offset < old->pls_range.offset)
+		new->pls_range.offset = old->pls_range.offset;
+	new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
+			new_end);
+	if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
+		set_bit(NFS_LSEG_ROC, &new->pls_flags);
+	if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+		set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
+	return true;
+}
+
+static void
+ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_segment *lseg,
+		struct list_head *free_me)
+{
+	pnfs_generic_layout_insert_lseg(lo, lseg,
+			ff_lseg_range_is_after,
+			ff_lseg_merge,
+			free_me);
+}
+
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
 {
 	int i, j;
 	int i, j;
@@ -246,6 +391,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		goto out_err_free;
 		goto out_err_free;
 
 
 	for (i = 0; i < fls->mirror_array_cnt; i++) {
 	for (i = 0; i < fls->mirror_array_cnt; i++) {
+		struct nfs4_ff_layout_mirror *mirror;
 		struct nfs4_deviceid devid;
 		struct nfs4_deviceid devid;
 		struct nfs4_deviceid_node *idnode;
 		struct nfs4_deviceid_node *idnode;
 		u32 ds_count;
 		u32 ds_count;
@@ -262,17 +408,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		if (ds_count != 1)
 		if (ds_count != 1)
 			goto out_err_free;
 			goto out_err_free;
 
 
-		fls->mirror_array[i] =
-			kzalloc(sizeof(struct nfs4_ff_layout_mirror),
-				gfp_flags);
+		fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
 		if (fls->mirror_array[i] == NULL) {
 		if (fls->mirror_array[i] == NULL) {
 			rc = -ENOMEM;
 			rc = -ENOMEM;
 			goto out_err_free;
 			goto out_err_free;
 		}
 		}
 
 
-		spin_lock_init(&fls->mirror_array[i]->lock);
 		fls->mirror_array[i]->ds_count = ds_count;
 		fls->mirror_array[i]->ds_count = ds_count;
-		fls->mirror_array[i]->lseg = &fls->generic_hdr;
 
 
 		/* deviceid */
 		/* deviceid */
 		rc = decode_deviceid(&stream, &devid);
 		rc = decode_deviceid(&stream, &devid);
@@ -338,6 +480,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		if (rc)
 		if (rc)
 			goto out_err_free;
 			goto out_err_free;
 
 
+		mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
+		if (mirror != fls->mirror_array[i]) {
+			ff_layout_free_mirror(fls->mirror_array[i]);
+			fls->mirror_array[i] = mirror;
+		}
+
 		dprintk("%s: uid %d gid %d\n", __func__,
 		dprintk("%s: uid %d gid %d\n", __func__,
 			fls->mirror_array[i]->uid,
 			fls->mirror_array[i]->uid,
 			fls->mirror_array[i]->gid);
 			fls->mirror_array[i]->gid);
@@ -379,21 +527,9 @@ static void
 ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
 ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
 {
 {
 	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
 	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
-	int i;
 
 
 	dprintk("--> %s\n", __func__);
 	dprintk("--> %s\n", __func__);
 
 
-	for (i = 0; i < fls->mirror_array_cnt; i++) {
-		if (fls->mirror_array[i]) {
-			nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
-			fls->mirror_array[i]->mirror_ds = NULL;
-			if (fls->mirror_array[i]->cred) {
-				put_rpccred(fls->mirror_array[i]->cred);
-				fls->mirror_array[i]->cred = NULL;
-			}
-		}
-	}
-
 	if (lseg->pls_range.iomode == IOMODE_RW) {
 	if (lseg->pls_range.iomode == IOMODE_RW) {
 		struct nfs4_flexfile_layout *ffl;
 		struct nfs4_flexfile_layout *ffl;
 		struct inode *inode;
 		struct inode *inode;
@@ -419,48 +555,44 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
 }
 }
 
 
 static void
 static void
-nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
 {
 {
 	/* first IO request? */
 	/* first IO request? */
 	if (atomic_inc_return(&timer->n_ops) == 1) {
 	if (atomic_inc_return(&timer->n_ops) == 1) {
-		timer->start_time = ktime_get();
+		timer->start_time = now;
 	}
 	}
 }
 }
 
 
 static ktime_t
 static ktime_t
-nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
 {
 {
-	ktime_t start, now;
+	ktime_t start;
 
 
 	if (atomic_dec_return(&timer->n_ops) < 0)
 	if (atomic_dec_return(&timer->n_ops) < 0)
 		WARN_ON_ONCE(1);
 		WARN_ON_ONCE(1);
 
 
-	now = ktime_get();
 	start = timer->start_time;
 	start = timer->start_time;
 	timer->start_time = now;
 	timer->start_time = now;
 	return ktime_sub(now, start);
 	return ktime_sub(now, start);
 }
 }
 
 
-static ktime_t
-nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
-{
-	return ktime_sub(ktime_get(), task->tk_start);
-}
-
 static bool
 static bool
 nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
 nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
-			    struct nfs4_ff_layoutstat *layoutstat)
+			    struct nfs4_ff_layoutstat *layoutstat,
+			    ktime_t now)
 {
 {
 	static const ktime_t notime = {0};
 	static const ktime_t notime = {0};
-	ktime_t now = ktime_get();
+	s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
 
 
-	nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+	nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
 	if (ktime_equal(mirror->start_time, notime))
 	if (ktime_equal(mirror->start_time, notime))
 		mirror->start_time = now;
 		mirror->start_time = now;
 	if (ktime_equal(mirror->last_report_time, notime))
 	if (ktime_equal(mirror->last_report_time, notime))
 		mirror->last_report_time = now;
 		mirror->last_report_time = now;
+	if (layoutstats_timer != 0)
+		report_interval = (s64)layoutstats_timer * 1000LL;
 	if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
 	if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
-			FF_LAYOUTSTATS_REPORT_INTERVAL) {
+			report_interval) {
 		mirror->last_report_time = now;
 		mirror->last_report_time = now;
 		return true;
 		return true;
 	}
 	}
@@ -482,35 +614,39 @@ static void
 nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
 nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
 		__u64 requested,
 		__u64 requested,
 		__u64 completed,
 		__u64 completed,
-		ktime_t time_completed)
+		ktime_t time_completed,
+		ktime_t time_started)
 {
 {
 	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
 	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+	ktime_t completion_time = ktime_sub(time_completed, time_started);
 	ktime_t timer;
 	ktime_t timer;
 
 
 	iostat->ops_completed++;
 	iostat->ops_completed++;
 	iostat->bytes_completed += completed;
 	iostat->bytes_completed += completed;
 	iostat->bytes_not_delivered += requested - completed;
 	iostat->bytes_not_delivered += requested - completed;
 
 
-	timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+	timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
 	iostat->total_busy_time =
 	iostat->total_busy_time =
 			ktime_add(iostat->total_busy_time, timer);
 			ktime_add(iostat->total_busy_time, timer);
 	iostat->aggregate_completion_time =
 	iostat->aggregate_completion_time =
-			ktime_add(iostat->aggregate_completion_time, time_completed);
+			ktime_add(iostat->aggregate_completion_time,
+					completion_time);
 }
 }
 
 
 static void
 static void
-nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
-		__u64 requested)
+nfs4_ff_layout_stat_io_start_read(struct inode *inode,
+		struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested, ktime_t now)
 {
 {
 	bool report;
 	bool report;
 
 
 	spin_lock(&mirror->lock);
 	spin_lock(&mirror->lock);
-	report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+	report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
 	nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
 	nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
 	spin_unlock(&mirror->lock);
 	spin_unlock(&mirror->lock);
 
 
 	if (report)
 	if (report)
-		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+		pnfs_report_layoutstat(inode, GFP_KERNEL);
 }
 }
 
 
 static void
 static void
@@ -522,23 +658,24 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
 	spin_lock(&mirror->lock);
 	spin_lock(&mirror->lock);
 	nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
 	nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
 			requested, completed,
 			requested, completed,
-			nfs4_ff_layout_calc_completion_time(task));
+			ktime_get(), task->tk_start);
 	spin_unlock(&mirror->lock);
 	spin_unlock(&mirror->lock);
 }
 }
 
 
 static void
 static void
-nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
-		__u64 requested)
+nfs4_ff_layout_stat_io_start_write(struct inode *inode,
+		struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested, ktime_t now)
 {
 {
 	bool report;
 	bool report;
 
 
 	spin_lock(&mirror->lock);
 	spin_lock(&mirror->lock);
-	report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat);
+	report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
 	nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
 	nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
 	spin_unlock(&mirror->lock);
 	spin_unlock(&mirror->lock);
 
 
 	if (report)
 	if (report)
-		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+		pnfs_report_layoutstat(inode, GFP_NOIO);
 }
 }
 
 
 static void
 static void
@@ -553,8 +690,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
 
 
 	spin_lock(&mirror->lock);
 	spin_lock(&mirror->lock);
 	nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
 	nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
-			requested, completed,
-			nfs4_ff_layout_calc_completion_time(task));
+			requested, completed, ktime_get(), task->tk_start);
 	spin_unlock(&mirror->lock);
 	spin_unlock(&mirror->lock);
 }
 }
 
 
@@ -728,8 +864,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
 		return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 		return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 
 
 	/* no lseg means that pnfs is not in use, so no mirroring here */
 	/* no lseg means that pnfs is not in use, so no mirroring here */
-	pnfs_put_lseg(pgio->pg_lseg);
-	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_write_mds(pgio);
 	nfs_pageio_reset_write_mds(pgio);
 	return 1;
 	return 1;
 }
 }
@@ -931,18 +1065,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
 	if (task->tk_status >= 0)
 	if (task->tk_status >= 0)
 		return 0;
 		return 0;
 
 
-	if (task->tk_status != -EJUKEBOX) {
+	switch (task->tk_status) {
+	/* File access problems. Don't mark the device as unavailable */
+	case -EACCES:
+	case -ESTALE:
+	case -EISDIR:
+	case -EBADHANDLE:
+	case -ELOOP:
+	case -ENOSPC:
+		break;
+	case -EJUKEBOX:
+		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+		goto out_retry;
+	default:
 		dprintk("%s DS connection error %d\n", __func__,
 		dprintk("%s DS connection error %d\n", __func__,
 			task->tk_status);
 			task->tk_status);
 		nfs4_mark_deviceid_unavailable(devid);
 		nfs4_mark_deviceid_unavailable(devid);
-		if (ff_layout_has_available_ds(lseg))
-			return -NFS4ERR_RESET_TO_PNFS;
-		else
-			return -NFS4ERR_RESET_TO_MDS;
 	}
 	}
-
-	if (task->tk_status == -EJUKEBOX)
-		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+	/* FIXME: Need to prevent infinite looping here. */
+	return -NFS4ERR_RESET_TO_PNFS;
+out_retry:
 	task->tk_status = 0;
 	task->tk_status = 0;
 	rpc_restart_call(task);
 	rpc_restart_call(task);
 	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
 	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
@@ -972,15 +1114,41 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
 
 
 static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 					int idx, u64 offset, u64 length,
 					int idx, u64 offset, u64 length,
-					u32 status, int opnum)
+					u32 status, int opnum, int error)
 {
 {
 	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_ff_layout_mirror *mirror;
 	int err;
 	int err;
 
 
+	if (status == 0) {
+		switch (error) {
+		case -ETIMEDOUT:
+		case -EPFNOSUPPORT:
+		case -EPROTONOSUPPORT:
+		case -EOPNOTSUPP:
+		case -ECONNREFUSED:
+		case -ECONNRESET:
+		case -EHOSTDOWN:
+		case -EHOSTUNREACH:
+		case -ENETUNREACH:
+		case -EADDRINUSE:
+		case -ENOBUFS:
+		case -EPIPE:
+		case -EPERM:
+			status = NFS4ERR_NXIO;
+			break;
+		case -EACCES:
+			status = NFS4ERR_ACCESS;
+			break;
+		default:
+			return;
+		}
+	}
+
 	mirror = FF_LAYOUT_COMP(lseg, idx);
 	mirror = FF_LAYOUT_COMP(lseg, idx);
 	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 				       mirror, offset, length, status, opnum,
 				       mirror, offset, length, status, opnum,
 				       GFP_NOIO);
 				       GFP_NOIO);
+	pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
 	dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
 	dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
 }
 }
 
 
@@ -989,16 +1157,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 static int ff_layout_read_done_cb(struct rpc_task *task,
 static int ff_layout_read_done_cb(struct rpc_task *task,
 				struct nfs_pgio_header *hdr)
 				struct nfs_pgio_header *hdr)
 {
 {
-	struct inode *inode;
 	int err;
 	int err;
 
 
 	trace_nfs4_pnfs_read(hdr, task->tk_status);
 	trace_nfs4_pnfs_read(hdr, task->tk_status);
-	if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
-		hdr->res.op_status = NFS4ERR_NXIO;
-	if (task->tk_status < 0 && hdr->res.op_status)
+	if (task->tk_status < 0)
 		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
 		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
 					    hdr->args.offset, hdr->args.count,
 					    hdr->args.offset, hdr->args.count,
-					    hdr->res.op_status, OP_READ);
+					    hdr->res.op_status, OP_READ,
+					    task->tk_status);
 	err = ff_layout_async_handle_error(task, hdr->args.context->state,
 	err = ff_layout_async_handle_error(task, hdr->args.context->state,
 					   hdr->ds_clp, hdr->lseg,
 					   hdr->ds_clp, hdr->lseg,
 					   hdr->pgio_mirror_idx);
 					   hdr->pgio_mirror_idx);
@@ -1010,8 +1176,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 		pnfs_read_resend_pnfs(hdr);
 		pnfs_read_resend_pnfs(hdr);
 		return task->tk_status;
 		return task->tk_status;
 	case -NFS4ERR_RESET_TO_MDS:
 	case -NFS4ERR_RESET_TO_MDS:
-		inode = hdr->lseg->pls_layout->plh_inode;
-		pnfs_error_mark_layout_for_return(inode, hdr->lseg);
 		ff_layout_reset_read(hdr);
 		ff_layout_reset_read(hdr);
 		return task->tk_status;
 		return task->tk_status;
 	case -EAGAIN:
 	case -EAGAIN:
@@ -1061,9 +1225,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 static int ff_layout_read_prepare_common(struct rpc_task *task,
 static int ff_layout_read_prepare_common(struct rpc_task *task,
 					 struct nfs_pgio_header *hdr)
 					 struct nfs_pgio_header *hdr)
 {
 {
-	nfs4_ff_layout_stat_io_start_read(
+	nfs4_ff_layout_stat_io_start_read(hdr->inode,
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-			hdr->args.count);
+			hdr->args.count,
+			task->tk_start);
 
 
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		rpc_exit(task, -EIO);
@@ -1163,32 +1328,26 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
 static int ff_layout_write_done_cb(struct rpc_task *task,
 static int ff_layout_write_done_cb(struct rpc_task *task,
 				struct nfs_pgio_header *hdr)
 				struct nfs_pgio_header *hdr)
 {
 {
-	struct inode *inode;
 	int err;
 	int err;
 
 
 	trace_nfs4_pnfs_write(hdr, task->tk_status);
 	trace_nfs4_pnfs_write(hdr, task->tk_status);
-	if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
-		hdr->res.op_status = NFS4ERR_NXIO;
-	if (task->tk_status < 0 && hdr->res.op_status)
+	if (task->tk_status < 0)
 		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
 		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
 					    hdr->args.offset, hdr->args.count,
 					    hdr->args.offset, hdr->args.count,
-					    hdr->res.op_status, OP_WRITE);
+					    hdr->res.op_status, OP_WRITE,
+					    task->tk_status);
 	err = ff_layout_async_handle_error(task, hdr->args.context->state,
 	err = ff_layout_async_handle_error(task, hdr->args.context->state,
 					   hdr->ds_clp, hdr->lseg,
 					   hdr->ds_clp, hdr->lseg,
 					   hdr->pgio_mirror_idx);
 					   hdr->pgio_mirror_idx);
 
 
 	switch (err) {
 	switch (err) {
 	case -NFS4ERR_RESET_TO_PNFS:
 	case -NFS4ERR_RESET_TO_PNFS:
+		pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
+		ff_layout_reset_write(hdr, true);
+		return task->tk_status;
 	case -NFS4ERR_RESET_TO_MDS:
 	case -NFS4ERR_RESET_TO_MDS:
-		inode = hdr->lseg->pls_layout->plh_inode;
-		pnfs_error_mark_layout_for_return(inode, hdr->lseg);
-		if (err == -NFS4ERR_RESET_TO_PNFS) {
-			pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
-			ff_layout_reset_write(hdr, true);
-		} else {
-			pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
-			ff_layout_reset_write(hdr, false);
-		}
+		pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
+		ff_layout_reset_write(hdr, false);
 		return task->tk_status;
 		return task->tk_status;
 	case -EAGAIN:
 	case -EAGAIN:
 		rpc_restart_call_prepare(task);
 		rpc_restart_call_prepare(task);
@@ -1199,34 +1358,35 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 	    hdr->res.verf->committed == NFS_DATA_SYNC)
 	    hdr->res.verf->committed == NFS_DATA_SYNC)
 		ff_layout_set_layoutcommit(hdr);
 		ff_layout_set_layoutcommit(hdr);
 
 
+	/* zero out fattr since we don't care DS attr at all */
+	hdr->fattr.valid = 0;
+	if (task->tk_status >= 0)
+		nfs_writeback_update_inode(hdr);
+
 	return 0;
 	return 0;
 }
 }
 
 
 static int ff_layout_commit_done_cb(struct rpc_task *task,
 static int ff_layout_commit_done_cb(struct rpc_task *task,
 				     struct nfs_commit_data *data)
 				     struct nfs_commit_data *data)
 {
 {
-	struct inode *inode;
 	int err;
 	int err;
 
 
 	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
 	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
-	if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
-		data->res.op_status = NFS4ERR_NXIO;
-	if (task->tk_status < 0 && data->res.op_status)
+	if (task->tk_status < 0)
 		ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
 		ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
 					    data->args.offset, data->args.count,
 					    data->args.offset, data->args.count,
-					    data->res.op_status, OP_COMMIT);
+					    data->res.op_status, OP_COMMIT,
+					    task->tk_status);
 	err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
 	err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
 					   data->lseg, data->ds_commit_index);
 					   data->lseg, data->ds_commit_index);
 
 
 	switch (err) {
 	switch (err) {
 	case -NFS4ERR_RESET_TO_PNFS:
 	case -NFS4ERR_RESET_TO_PNFS:
+		pnfs_set_retry_layoutget(data->lseg->pls_layout);
+		pnfs_generic_prepare_to_resend_writes(data);
+		return -EAGAIN;
 	case -NFS4ERR_RESET_TO_MDS:
 	case -NFS4ERR_RESET_TO_MDS:
-		inode = data->lseg->pls_layout->plh_inode;
-		pnfs_error_mark_layout_for_return(inode, data->lseg);
-		if (err == -NFS4ERR_RESET_TO_PNFS)
-			pnfs_set_retry_layoutget(data->lseg->pls_layout);
-		else
-			pnfs_clear_retry_layoutget(data->lseg->pls_layout);
+		pnfs_clear_retry_layoutget(data->lseg->pls_layout);
 		pnfs_generic_prepare_to_resend_writes(data);
 		pnfs_generic_prepare_to_resend_writes(data);
 		return -EAGAIN;
 		return -EAGAIN;
 	case -EAGAIN:
 	case -EAGAIN:
@@ -1244,9 +1404,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 static int ff_layout_write_prepare_common(struct rpc_task *task,
 static int ff_layout_write_prepare_common(struct rpc_task *task,
 					  struct nfs_pgio_header *hdr)
 					  struct nfs_pgio_header *hdr)
 {
 {
-	nfs4_ff_layout_stat_io_start_write(
+	nfs4_ff_layout_stat_io_start_write(hdr->inode,
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-			hdr->args.count);
+			hdr->args.count,
+			task->tk_start);
 
 
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		rpc_exit(task, -EIO);
@@ -1325,9 +1486,9 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
 static void ff_layout_commit_prepare_common(struct rpc_task *task,
 static void ff_layout_commit_prepare_common(struct rpc_task *task,
 		struct nfs_commit_data *cdata)
 		struct nfs_commit_data *cdata)
 {
 {
-	nfs4_ff_layout_stat_io_start_write(
+	nfs4_ff_layout_stat_io_start_write(cdata->inode,
 			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
 			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
-			0);
+			0, task->tk_start);
 }
 }
 
 
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
@@ -1842,53 +2003,55 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr,
 	*start = cpu_to_be32((xdr->p - start - 1) * 4);
 	*start = cpu_to_be32((xdr->p - start - 1) * 4);
 }
 }
 
 
-static bool
+static int
 ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
 ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
-			       struct pnfs_layout_segment *pls,
-			       int *dev_count, int dev_limit)
+			       struct pnfs_layout_hdr *lo,
+			       int dev_limit)
 {
 {
+	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
 	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_deviceid_node *dev;
 	struct nfs4_deviceid_node *dev;
 	struct nfs42_layoutstat_devinfo *devinfo;
 	struct nfs42_layoutstat_devinfo *devinfo;
-	int i;
+	int i = 0;
 
 
-	for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
-		if (*dev_count >= dev_limit)
+	list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+		if (i >= dev_limit)
 			break;
 			break;
-		mirror = FF_LAYOUT_COMP(pls, i);
-		if (!mirror || !mirror->mirror_ds)
+		if (!mirror->mirror_ds)
+			continue;
+		/* mirror refcount put in cleanup_layoutstats */
+		if (!atomic_inc_not_zero(&mirror->ref))
 			continue;
 			continue;
-		dev = FF_LAYOUT_DEVID_NODE(pls, i);
-		devinfo = &args->devinfo[*dev_count];
+		dev = &mirror->mirror_ds->id_node; 
+		devinfo = &args->devinfo[i];
 		memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
 		memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
-		devinfo->offset = pls->pls_range.offset;
-		devinfo->length = pls->pls_range.length;
-		/* well, we don't really know if IO is continuous or not! */
-		devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+		devinfo->offset = 0;
+		devinfo->length = NFS4_MAX_UINT64;
+		devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
 		devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
 		devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
-		devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+		devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
 		devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
 		devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
 		devinfo->layout_type = LAYOUT_FLEX_FILES;
 		devinfo->layout_type = LAYOUT_FLEX_FILES;
 		devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
 		devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
 		devinfo->layout_private = mirror;
 		devinfo->layout_private = mirror;
-		/* lseg refcount put in cleanup_layoutstats */
-		pnfs_get_lseg(pls);
 
 
-		++(*dev_count);
+		i++;
 	}
 	}
-
-	return *dev_count < dev_limit;
+	return i;
 }
 }
 
 
 static int
 static int
 ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 {
 {
-	struct pnfs_layout_segment *pls;
+	struct nfs4_flexfile_layout *ff_layout;
+	struct nfs4_ff_layout_mirror *mirror;
 	int dev_count = 0;
 	int dev_count = 0;
 
 
 	spin_lock(&args->inode->i_lock);
 	spin_lock(&args->inode->i_lock);
-	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
-		dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+	ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
+	list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+		if (atomic_read(&mirror->ref) != 0)
+			dev_count ++;
 	}
 	}
 	spin_unlock(&args->inode->i_lock);
 	spin_unlock(&args->inode->i_lock);
 	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
 	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
@@ -1897,20 +2060,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 			__func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
 			__func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
 		dev_count = PNFS_LAYOUTSTATS_MAXDEV;
 		dev_count = PNFS_LAYOUTSTATS_MAXDEV;
 	}
 	}
-	args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+	args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
 	if (!args->devinfo)
 	if (!args->devinfo)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
-	dev_count = 0;
 	spin_lock(&args->inode->i_lock);
 	spin_lock(&args->inode->i_lock);
-	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
-		if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
-						    PNFS_LAYOUTSTATS_MAXDEV)) {
-			break;
-		}
-	}
+	args->num_dev = ff_layout_mirror_prepare_stats(args,
+			&ff_layout->generic_hdr, dev_count);
 	spin_unlock(&args->inode->i_lock);
 	spin_unlock(&args->inode->i_lock);
-	args->num_dev = dev_count;
 
 
 	return 0;
 	return 0;
 }
 }
@@ -1924,7 +2081,7 @@ ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
 	for (i = 0; i < data->args.num_dev; i++) {
 	for (i = 0; i < data->args.num_dev; i++) {
 		mirror = data->args.devinfo[i].layout_private;
 		mirror = data->args.devinfo[i].layout_private;
 		data->args.devinfo[i].layout_private = NULL;
 		data->args.devinfo[i].layout_private = NULL;
-		pnfs_put_lseg(mirror->lseg);
+		ff_layout_put_mirror(mirror);
 	}
 	}
 }
 }
 
 
@@ -1936,6 +2093,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
 	.free_layout_hdr	= ff_layout_free_layout_hdr,
 	.free_layout_hdr	= ff_layout_free_layout_hdr,
 	.alloc_lseg		= ff_layout_alloc_lseg,
 	.alloc_lseg		= ff_layout_alloc_lseg,
 	.free_lseg		= ff_layout_free_lseg,
 	.free_lseg		= ff_layout_free_lseg,
+	.add_lseg		= ff_layout_add_lseg,
 	.pg_read_ops		= &ff_layout_pg_read_ops,
 	.pg_read_ops		= &ff_layout_pg_read_ops,
 	.pg_write_ops		= &ff_layout_pg_write_ops,
 	.pg_write_ops		= &ff_layout_pg_write_ops,
 	.get_ds_info		= ff_layout_get_ds_info,
 	.get_ds_info		= ff_layout_get_ds_info,

+ 4 - 1
fs/nfs/flexfilelayout/flexfilelayout.h

@@ -67,7 +67,8 @@ struct nfs4_ff_layoutstat {
 };
 };
 
 
 struct nfs4_ff_layout_mirror {
 struct nfs4_ff_layout_mirror {
-	struct pnfs_layout_segment	*lseg; /* back pointer */
+	struct pnfs_layout_hdr		*layout;
+	struct list_head		mirrors;
 	u32				ds_count;
 	u32				ds_count;
 	u32				efficiency;
 	u32				efficiency;
 	struct nfs4_ff_layout_ds	*mirror_ds;
 	struct nfs4_ff_layout_ds	*mirror_ds;
@@ -77,6 +78,7 @@ struct nfs4_ff_layout_mirror {
 	u32				uid;
 	u32				uid;
 	u32				gid;
 	u32				gid;
 	struct rpc_cred			*cred;
 	struct rpc_cred			*cred;
+	atomic_t			ref;
 	spinlock_t			lock;
 	spinlock_t			lock;
 	struct nfs4_ff_layoutstat	read_stat;
 	struct nfs4_ff_layoutstat	read_stat;
 	struct nfs4_ff_layoutstat	write_stat;
 	struct nfs4_ff_layoutstat	write_stat;
@@ -95,6 +97,7 @@ struct nfs4_ff_layout_segment {
 struct nfs4_flexfile_layout {
 struct nfs4_flexfile_layout {
 	struct pnfs_layout_hdr generic_hdr;
 	struct pnfs_layout_hdr generic_hdr;
 	struct pnfs_ds_commit_info commit_info;
 	struct pnfs_ds_commit_info commit_info;
+	struct list_head	mirrors;
 	struct list_head	error_list; /* nfs4_ff_layout_ds_err */
 	struct list_head	error_list; /* nfs4_ff_layout_ds_err */
 };
 };
 
 

+ 63 - 19
fs/nfs/flexfilelayout/flexfilelayoutdev.c

@@ -172,6 +172,32 @@ out_err:
 	return NULL;
 	return NULL;
 }
 }
 
 
+static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
+		struct nfs4_deviceid_node *devid)
+{
+	nfs4_mark_deviceid_unavailable(devid);
+	if (!ff_layout_has_available_ds(lseg))
+		pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+				lseg);
+}
+
+static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
+		struct nfs4_ff_layout_mirror *mirror)
+{
+	if (mirror == NULL || mirror->mirror_ds == NULL) {
+		pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+					lseg);
+		return false;
+	}
+	if (mirror->mirror_ds->ds == NULL) {
+		struct nfs4_deviceid_node *devid;
+		devid = &mirror->mirror_ds->id_node;
+		ff_layout_mark_devid_invalid(lseg, devid);
+		return false;
+	}
+	return true;
+}
+
 static u64
 static u64
 end_offset(u64 start, u64 len)
 end_offset(u64 start, u64 len)
 {
 {
@@ -336,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
 {
 {
 	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
 	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
 	struct nfs_fh *fh = NULL;
 	struct nfs_fh *fh = NULL;
-	struct nfs4_deviceid_node *devid;
 
 
-	if (mirror == NULL || mirror->mirror_ds == NULL ||
-	    mirror->mirror_ds->ds == NULL) {
-		printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
+	if (!ff_layout_mirror_valid(lseg, mirror)) {
+		pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
 			__func__, mirror_idx);
 			__func__, mirror_idx);
-		if (mirror && mirror->mirror_ds) {
-			devid = &mirror->mirror_ds->id_node;
-			pnfs_generic_mark_devid_invalid(devid);
-		}
 		goto out;
 		goto out;
 	}
 	}
 
 
@@ -368,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 	unsigned int max_payload;
 	unsigned int max_payload;
 	rpc_authflavor_t flavor;
 	rpc_authflavor_t flavor;
 
 
-	if (mirror == NULL || mirror->mirror_ds == NULL ||
-	    mirror->mirror_ds->ds == NULL) {
-		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+	if (!ff_layout_mirror_valid(lseg, mirror)) {
+		pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
 			__func__, ds_idx);
 			__func__, ds_idx);
-		if (mirror && mirror->mirror_ds) {
-			devid = &mirror->mirror_ds->id_node;
-			pnfs_generic_mark_devid_invalid(devid);
-		}
 		goto out;
 		goto out;
 	}
 	}
 
 
@@ -500,16 +515,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
 					   range->offset, range->length))
 					   range->offset, range->length))
 			continue;
 			continue;
 		/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
 		/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
-		 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
+		 * + array length + deviceid(NFS4_DEVICEID4_SIZE)
+		 * + status(4) + opnum(4)
 		 */
 		 */
 		p = xdr_reserve_space(xdr,
 		p = xdr_reserve_space(xdr,
-				24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
+				28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
 		if (unlikely(!p))
 		if (unlikely(!p))
 			return -ENOBUFS;
 			return -ENOBUFS;
 		p = xdr_encode_hyper(p, err->offset);
 		p = xdr_encode_hyper(p, err->offset);
 		p = xdr_encode_hyper(p, err->length);
 		p = xdr_encode_hyper(p, err->length);
 		p = xdr_encode_opaque_fixed(p, &err->stateid,
 		p = xdr_encode_opaque_fixed(p, &err->stateid,
 					    NFS4_STATEID_SIZE);
 					    NFS4_STATEID_SIZE);
+		/* Encode 1 error */
+		*p++ = cpu_to_be32(1);
 		p = xdr_encode_opaque_fixed(p, &err->deviceid,
 		p = xdr_encode_opaque_fixed(p, &err->deviceid,
 					    NFS4_DEVICEID4_SIZE);
 					    NFS4_DEVICEID4_SIZE);
 		*p++ = cpu_to_be32(err->status);
 		*p++ = cpu_to_be32(err->status);
@@ -525,11 +543,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
 	return 0;
 	return 0;
 }
 }
 
 
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
 {
 {
 	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_deviceid_node *devid;
 	struct nfs4_deviceid_node *devid;
-	int idx;
+	u32 idx;
 
 
 	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
 	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
 		mirror = FF_LAYOUT_COMP(lseg, idx);
 		mirror = FF_LAYOUT_COMP(lseg, idx);
@@ -543,6 +561,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
 	return false;
 	return false;
 }
 }
 
 
+static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	struct nfs4_deviceid_node *devid;
+	u32 idx;
+
+	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+		mirror = FF_LAYOUT_COMP(lseg, idx);
+		if (!mirror || !mirror->mirror_ds)
+			return false;
+		devid = &mirror->mirror_ds->id_node;
+		if (ff_layout_test_devid_unavailable(devid))
+			return false;
+	}
+
+	return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
+}
+
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode == IOMODE_READ)
+		return  ff_read_layout_has_available_ds(lseg);
+	/* Note: RW layout needs all mirrors available */
+	return ff_rw_layout_has_available_ds(lseg);
+}
+
 module_param(dataserver_retrans, uint, 0644);
 module_param(dataserver_retrans, uint, 0644);
 MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
 MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
 			"retries a request before it attempts further "
 			"retries a request before it attempts further "

+ 34 - 27
fs/nfs/inode.c

@@ -504,7 +504,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 {
 	struct inode *inode = d_inode(dentry);
 	struct inode *inode = d_inode(dentry);
 	struct nfs_fattr *fattr;
 	struct nfs_fattr *fattr;
-	int error = -ENOMEM;
+	int error = 0;
 
 
 	nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
 	nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
 
 
@@ -513,15 +513,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		attr->ia_valid &= ~ATTR_MODE;
 		attr->ia_valid &= ~ATTR_MODE;
 
 
 	if (attr->ia_valid & ATTR_SIZE) {
 	if (attr->ia_valid & ATTR_SIZE) {
-		loff_t i_size;
-
 		BUG_ON(!S_ISREG(inode->i_mode));
 		BUG_ON(!S_ISREG(inode->i_mode));
 
 
-		i_size = i_size_read(inode);
-		if (attr->ia_size == i_size)
+		error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+
+		if (attr->ia_size == i_size_read(inode))
 			attr->ia_valid &= ~ATTR_SIZE;
 			attr->ia_valid &= ~ATTR_SIZE;
-		else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
-			return -ETXTBSY;
 	}
 	}
 
 
 	/* Optimization: if the end result is no change, don't RPC */
 	/* Optimization: if the end result is no change, don't RPC */
@@ -536,8 +535,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		nfs_sync_inode(inode);
 		nfs_sync_inode(inode);
 
 
 	fattr = nfs_alloc_fattr();
 	fattr = nfs_alloc_fattr();
-	if (fattr == NULL)
+	if (fattr == NULL) {
+		error = -ENOMEM;
 		goto out;
 		goto out;
+	}
+
 	/*
 	/*
 	 * Return any delegations if we're going to change ACLs
 	 * Return any delegations if we're going to change ACLs
 	 */
 	 */
@@ -759,11 +761,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context);
  * @ctx: pointer to context
  * @ctx: pointer to context
  * @is_sync: is this a synchronous close
  * @is_sync: is this a synchronous close
  *
  *
- * always ensure that the attributes are up to date if we're mounted
- * with close-to-open semantics
+ * Ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics and we have cached data that will
+ * need to be revalidated on open.
  */
  */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 {
 {
+	struct nfs_inode *nfsi;
 	struct inode *inode;
 	struct inode *inode;
 	struct nfs_server *server;
 	struct nfs_server *server;
 
 
@@ -772,7 +776,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 	if (!is_sync)
 	if (!is_sync)
 		return;
 		return;
 	inode = d_inode(ctx->dentry);
 	inode = d_inode(ctx->dentry);
-	if (!list_empty(&NFS_I(inode)->open_files))
+	nfsi = NFS_I(inode);
+	if (inode->i_mapping->nrpages == 0)
+		return;
+	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+		return;
+	if (!list_empty(&nfsi->open_files))
 		return;
 		return;
 	server = NFS_SERVER(inode);
 	server = NFS_SERVER(inode);
 	if (server->flags & NFS_MOUNT_NOCTO)
 	if (server->flags & NFS_MOUNT_NOCTO)
@@ -844,6 +853,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
 }
 }
 EXPORT_SYMBOL_GPL(put_nfs_open_context);
 EXPORT_SYMBOL_GPL(put_nfs_open_context);
 
 
+static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
+{
+	__put_nfs_open_context(ctx, 1);
+}
+
 /*
 /*
  * Ensure that mmap has a recent RPC credential for use when writing out
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
  * shared pages
@@ -888,7 +902,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 	return ctx;
 	return ctx;
 }
 }
 
 
-static void nfs_file_clear_open_context(struct file *filp)
+void nfs_file_clear_open_context(struct file *filp)
 {
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(filp);
 	struct nfs_open_context *ctx = nfs_file_open_context(filp);
 
 
@@ -899,7 +913,7 @@ static void nfs_file_clear_open_context(struct file *filp)
 		spin_lock(&inode->i_lock);
 		spin_lock(&inode->i_lock);
 		list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
 		list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode->i_lock);
-		__put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
+		put_nfs_open_context_sync(ctx);
 	}
 	}
 }
 }
 
 
@@ -919,12 +933,6 @@ int nfs_open(struct inode *inode, struct file *filp)
 	return 0;
 	return 0;
 }
 }
 
 
-int nfs_release(struct inode *inode, struct file *filp)
-{
-	nfs_file_clear_open_context(filp);
-	return 0;
-}
-
 /*
 /*
  * This function is called whenever some part of NFS notices that
  * This function is called whenever some part of NFS notices that
  * the cached attributes have to be refreshed.
  * the cached attributes have to be refreshed.
@@ -1273,13 +1281,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	return 0;
 	return 0;
 }
 }
 
 
-static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
-{
-	if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
-		return 0;
-	return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
-}
-
 static atomic_long_t nfs_attr_generation_counter;
 static atomic_long_t nfs_attr_generation_counter;
 
 
 static unsigned long nfs_read_attr_generation_counter(void)
 static unsigned long nfs_read_attr_generation_counter(void)
@@ -1428,7 +1429,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
 	const struct nfs_inode *nfsi = NFS_I(inode);
 	const struct nfs_inode *nfsi = NFS_I(inode);
 
 
 	return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
 	return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
-		nfs_ctime_need_update(inode, fattr) ||
 		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
 		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
 }
 }
 
 
@@ -1491,6 +1491,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
 {
 {
 	unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 	unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 
+	/*
+	 * Don't revalidate the pagecache if we hold a delegation, but do
+	 * force an attribute update
+	 */
+	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+		invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED;
+
 	if (S_ISDIR(inode->i_mode))
 	if (S_ISDIR(inode->i_mode))
 		invalid |= NFS_INO_INVALID_DATA;
 		invalid |= NFS_INO_INVALID_DATA;
 	nfs_set_cache_invalid(inode, invalid);
 	nfs_set_cache_invalid(inode, invalid);

+ 10 - 10
fs/nfs/internal.h

@@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void)
 }
 }
 #endif
 #endif
 
 
-#ifdef CONFIG_NFS_V4_1
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
-#endif
-
 /* callback_xdr.c */
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version4;
 extern struct svc_version nfs4_callback_version4;
@@ -364,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
 /* file.c */
 /* file.c */
 int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
 int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
 loff_t nfs_file_llseek(struct file *, loff_t, int);
 loff_t nfs_file_llseek(struct file *, loff_t, int);
-int nfs_file_flush(struct file *, fl_owner_t);
 ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
 ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
 ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
 ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
 			     size_t, unsigned int);
 			     size_t, unsigned int);
@@ -490,6 +485,9 @@ void nfs_retry_commit(struct list_head *page_list,
 void nfs_commitdata_release(struct nfs_commit_data *data);
 void nfs_commitdata_release(struct nfs_commit_data *data);
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 				 struct nfs_commit_info *cinfo);
 				 struct nfs_commit_info *cinfo);
+void nfs_request_add_commit_list_locked(struct nfs_page *req,
+		struct list_head *dst,
+		struct nfs_commit_info *cinfo);
 void nfs_request_remove_commit_list(struct nfs_page *req,
 void nfs_request_remove_commit_list(struct nfs_page *req,
 				    struct nfs_commit_info *cinfo);
 				    struct nfs_commit_info *cinfo);
 void nfs_init_cinfo(struct nfs_commit_info *cinfo,
 void nfs_init_cinfo(struct nfs_commit_info *cinfo,
@@ -623,13 +621,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
  * Record the page as unstable and mark its inode as dirty.
  * Record the page as unstable and mark its inode as dirty.
  */
  */
 static inline
 static inline
-void nfs_mark_page_unstable(struct page *page)
+void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
 {
 {
-	struct inode *inode = page_file_mapping(page)->host;
+	if (!cinfo->dreq) {
+		struct inode *inode = page_file_mapping(page)->host;
 
 
-	inc_zone_page_state(page, NR_UNSTABLE_NFS);
-	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
-	 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		inc_zone_page_state(page, NR_UNSTABLE_NFS);
+		inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	}
 }
 }
 
 
 /*
 /*

+ 1 - 0
fs/nfs/nfs3xdr.c

@@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
 {
 {
 	encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
 	encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
 	encode_symlinkdata3(xdr, args);
 	encode_symlinkdata3(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
 }
 }
 
 
 /*
 /*

+ 0 - 2
fs/nfs/nfs42.h

@@ -17,7 +17,5 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
 				   struct nfs42_layoutstat_data *);
 				   struct nfs42_layoutstat_data *);
-/* nfs4.2xdr.h */
-extern struct rpc_procinfo nfs4_2_procedures[];
 
 
 #endif /* __LINUX_FS_NFS_NFS4_2_H */
 #endif /* __LINUX_FS_NFS_NFS4_2_H */

+ 2 - 3
fs/nfs/nfs42xdr.c

@@ -238,8 +238,7 @@ out_overflow:
 	return -EIO;
 	return -EIO;
 }
 }
 
 
-static int decode_layoutstats(struct xdr_stream *xdr,
-			      struct nfs42_layoutstat_res *res)
+static int decode_layoutstats(struct xdr_stream *xdr)
 {
 {
 	return decode_op_hdr(xdr, OP_LAYOUTSTATS);
 	return decode_op_hdr(xdr, OP_LAYOUTSTATS);
 }
 }
@@ -343,7 +342,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
 		goto out;
 		goto out;
 	WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
 	WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
 	for (i = 0; i < res->num_dev; i++) {
 	for (i = 0; i < res->num_dev; i++) {
-		status = decode_layoutstats(xdr, res);
+		status = decode_layoutstats(xdr);
 		if (status)
 		if (status)
 			goto out;
 			goto out;
 	}
 	}

+ 1 - 3
fs/nfs/nfs4_fs.h

@@ -405,9 +405,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
 int nfs41_discover_server_trunking(struct nfs_client *clp,
 int nfs41_discover_server_trunking(struct nfs_client *clp,
 			struct nfs_client **, struct rpc_cred *);
 			struct nfs_client **, struct rpc_cred *);
 extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
 extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
-extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
-extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
-
+extern void nfs41_notify_server(struct nfs_client *);
 #else
 #else
 static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 {
 {

+ 1 - 4
fs/nfs/nfs4client.c

@@ -729,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
 		return false;
 		return false;
 
 
 	/* Match only the IP address, not the port number */
 	/* Match only the IP address, not the port number */
-	if (!nfs_sockaddr_match_ipaddr(addr, clap))
-		return false;
-
-	return true;
+	return rpc_cmp_addr(addr, clap);
 }
 }
 
 
 /*
 /*

+ 29 - 3
fs/nfs/nfs4file.c

@@ -6,7 +6,9 @@
 #include <linux/fs.h>
 #include <linux/fs.h>
 #include <linux/falloc.h>
 #include <linux/falloc.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_fs.h>
+#include "delegation.h"
 #include "internal.h"
 #include "internal.h"
+#include "iostat.h"
 #include "fscache.h"
 #include "fscache.h"
 #include "pnfs.h"
 #include "pnfs.h"
 
 
@@ -27,7 +29,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 	struct inode *dir;
 	struct inode *dir;
 	unsigned openflags = filp->f_flags;
 	unsigned openflags = filp->f_flags;
 	struct iattr attr;
 	struct iattr attr;
-	int opened = 0;
 	int err;
 	int err;
 
 
 	/*
 	/*
@@ -66,7 +67,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 		nfs_sync_inode(inode);
 		nfs_sync_inode(inode);
 	}
 	}
 
 
-	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
 	if (IS_ERR(inode)) {
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		err = PTR_ERR(inode);
 		switch (err) {
 		switch (err) {
@@ -100,6 +101,31 @@ out_drop:
 	goto out_put_ctx;
 	goto out_put_ctx;
 }
 }
 
 
+/*
+ * Flush all dirty pages, and check for write errors.
+ */
+static int
+nfs4_file_flush(struct file *file, fl_owner_t id)
+{
+	struct inode	*inode = file_inode(file);
+
+	dprintk("NFS: flush(%pD2)\n", file);
+
+	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+	if ((file->f_mode & FMODE_WRITE) == 0)
+		return 0;
+
+	/*
+	 * If we're holding a write delegation, then check if we're required
+	 * to flush the i/o on close. If not, then just start the i/o now.
+	 */
+	if (!nfs4_delegation_flush_on_close(inode))
+		return filemap_fdatawrite(file->f_mapping);
+
+	/* Flush writes to the server and return any errors */
+	return vfs_fsync(file, 0);
+}
+
 static int
 static int
 nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 {
@@ -178,7 +204,7 @@ const struct file_operations nfs4_file_operations = {
 	.write_iter	= nfs_file_write,
 	.write_iter	= nfs_file_write,
 	.mmap		= nfs_file_mmap,
 	.mmap		= nfs_file_mmap,
 	.open		= nfs4_file_open,
 	.open		= nfs4_file_open,
-	.flush		= nfs_file_flush,
+	.flush		= nfs4_file_flush,
 	.release	= nfs_file_release,
 	.release	= nfs_file_release,
 	.fsync		= nfs4_file_fsync,
 	.fsync		= nfs4_file_fsync,
 	.lock		= nfs_lock,
 	.lock		= nfs_lock,

+ 2 - 12
fs/nfs/nfs4idmap.c

@@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = {
 	.read		= user_read,
 	.read		= user_read,
 };
 };
 
 
-static int nfs_idmap_init_keyring(void)
+int nfs_idmap_init(void)
 {
 {
 	struct cred *cred;
 	struct cred *cred;
 	struct key *keyring;
 	struct key *keyring;
@@ -230,7 +230,7 @@ failed_put_cred:
 	return ret;
 	return ret;
 }
 }
 
 
-static void nfs_idmap_quit_keyring(void)
+void nfs_idmap_quit(void)
 {
 {
 	key_revoke(id_resolver_cache->thread_keyring);
 	key_revoke(id_resolver_cache->thread_keyring);
 	unregister_key_type(&key_type_id_resolver);
 	unregister_key_type(&key_type_id_resolver);
@@ -492,16 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp)
 	kfree(idmap);
 	kfree(idmap);
 }
 }
 
 
-int nfs_idmap_init(void)
-{
-	return nfs_idmap_init_keyring();
-}
-
-void nfs_idmap_quit(void)
-{
-	nfs_idmap_quit_keyring();
-}
-
 static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
 static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
 				     struct idmap_msg *im,
 				     struct idmap_msg *im,
 				     struct rpc_pipe_msg *msg)
 				     struct rpc_pipe_msg *msg)

+ 84 - 52
fs/nfs/nfs4proc.c

@@ -586,7 +586,7 @@ out_unlock:
 	spin_unlock(&tbl->slot_tbl_lock);
 	spin_unlock(&tbl->slot_tbl_lock);
 	res->sr_slot = NULL;
 	res->sr_slot = NULL;
 	if (send_new_highest_used_slotid)
 	if (send_new_highest_used_slotid)
-		nfs41_server_notify_highest_slotid_update(session->clp);
+		nfs41_notify_server(session->clp);
 }
 }
 
 
 int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -1150,7 +1150,8 @@ out:
 	return ret;
 	return ret;
 }
 }
 
 
-static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
+		enum open_claim_type4 claim)
 {
 {
 	if (delegation == NULL)
 	if (delegation == NULL)
 		return 0;
 		return 0;
@@ -1158,6 +1159,16 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
 		return 0;
 		return 0;
 	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
 	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
 		return 0;
 		return 0;
+	switch (claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_FH:
+		break;
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+		if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+			break;
+	default:
+		return 0;
+	}
 	nfs_mark_delegation_referenced(delegation);
 	nfs_mark_delegation_referenced(delegation);
 	return 1;
 	return 1;
 }
 }
@@ -1220,6 +1231,7 @@ static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
 }
 }
 
 
 static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
 static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+		nfs4_stateid *arg_stateid,
 		nfs4_stateid *stateid, fmode_t fmode)
 		nfs4_stateid *stateid, fmode_t fmode)
 {
 {
 	clear_bit(NFS_O_RDWR_STATE, &state->flags);
 	clear_bit(NFS_O_RDWR_STATE, &state->flags);
@@ -1238,8 +1250,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
 	if (stateid == NULL)
 	if (stateid == NULL)
 		return;
 		return;
 	/* Handle races with OPEN */
 	/* Handle races with OPEN */
-	if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
-	    !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+	if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) ||
+	    (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+	    !nfs4_stateid_is_newer(stateid, &state->open_stateid))) {
 		nfs_resync_open_stateid_locked(state);
 		nfs_resync_open_stateid_locked(state);
 		return;
 		return;
 	}
 	}
@@ -1248,10 +1261,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
 	nfs4_stateid_copy(&state->open_stateid, stateid);
 	nfs4_stateid_copy(&state->open_stateid, stateid);
 }
 }
 
 
-static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_clear_open_stateid(struct nfs4_state *state,
+	nfs4_stateid *arg_stateid,
+	nfs4_stateid *stateid, fmode_t fmode)
 {
 {
 	write_seqlock(&state->seqlock);
 	write_seqlock(&state->seqlock);
-	nfs_clear_open_stateid_locked(state, stateid, fmode);
+	nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode);
 	write_sequnlock(&state->seqlock);
 	write_sequnlock(&state->seqlock);
 	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
 	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
 		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
 		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
@@ -1376,6 +1391,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	struct nfs_delegation *delegation;
 	struct nfs_delegation *delegation;
 	int open_mode = opendata->o_arg.open_flags;
 	int open_mode = opendata->o_arg.open_flags;
 	fmode_t fmode = opendata->o_arg.fmode;
 	fmode_t fmode = opendata->o_arg.fmode;
+	enum open_claim_type4 claim = opendata->o_arg.claim;
 	nfs4_stateid stateid;
 	nfs4_stateid stateid;
 	int ret = -EAGAIN;
 	int ret = -EAGAIN;
 
 
@@ -1389,7 +1405,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 		spin_unlock(&state->owner->so_lock);
 		spin_unlock(&state->owner->so_lock);
 		rcu_read_lock();
 		rcu_read_lock();
 		delegation = rcu_dereference(nfsi->delegation);
 		delegation = rcu_dereference(nfsi->delegation);
-		if (!can_open_delegated(delegation, fmode)) {
+		if (!can_open_delegated(delegation, fmode, claim)) {
 			rcu_read_unlock();
 			rcu_read_unlock();
 			break;
 			break;
 		}
 		}
@@ -1852,6 +1868,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	struct nfs4_opendata *data = calldata;
 	struct nfs4_opendata *data = calldata;
 	struct nfs4_state_owner *sp = data->owner;
 	struct nfs4_state_owner *sp = data->owner;
 	struct nfs_client *clp = sp->so_server->nfs_client;
 	struct nfs_client *clp = sp->so_server->nfs_client;
+	enum open_claim_type4 claim = data->o_arg.claim;
 
 
 	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
 	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
 		goto out_wait;
 		goto out_wait;
@@ -1866,15 +1883,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 			goto out_no_action;
 			goto out_no_action;
 		rcu_read_lock();
 		rcu_read_lock();
 		delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
 		delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
-		if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
-		    data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
-		    can_open_delegated(delegation, data->o_arg.fmode))
+		if (can_open_delegated(delegation, data->o_arg.fmode, claim))
 			goto unlock_no_action;
 			goto unlock_no_action;
 		rcu_read_unlock();
 		rcu_read_unlock();
 	}
 	}
 	/* Update client id. */
 	/* Update client id. */
 	data->o_arg.clientid = clp->cl_clientid;
 	data->o_arg.clientid = clp->cl_clientid;
-	switch (data->o_arg.claim) {
+	switch (claim) {
+	default:
+		break;
 	case NFS4_OPEN_CLAIM_PREVIOUS:
 	case NFS4_OPEN_CLAIM_PREVIOUS:
 	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
 	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
 	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
 	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
@@ -2294,15 +2311,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
  * fields corresponding to attributes that were used to store the verifier.
  * fields corresponding to attributes that were used to store the verifier.
  * Make sure we clobber those fields in the later setattr call
  * Make sure we clobber those fields in the later setattr call
  */
  */
-static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
+static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+				struct iattr *sattr, struct nfs4_label **label)
 {
 {
-	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
+	const u32 *attrset = opendata->o_res.attrset;
+
+	if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
 	    !(sattr->ia_valid & ATTR_ATIME_SET))
 	    !(sattr->ia_valid & ATTR_ATIME_SET))
 		sattr->ia_valid |= ATTR_ATIME;
 		sattr->ia_valid |= ATTR_ATIME;
 
 
-	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
+	if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
 	    !(sattr->ia_valid & ATTR_MTIME_SET))
 	    !(sattr->ia_valid & ATTR_MTIME_SET))
 		sattr->ia_valid |= ATTR_MTIME;
 		sattr->ia_valid |= ATTR_MTIME;
+
+	/* Except MODE, it seems harmless of setting twice. */
+	if ((attrset[1] & FATTR4_WORD1_MODE))
+		sattr->ia_valid &= ~ATTR_MODE;
+
+	if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
+		*label = NULL;
 }
 }
 
 
 static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
@@ -2425,9 +2452,9 @@ static int _nfs4_do_open(struct inode *dir,
 		goto err_free_label;
 		goto err_free_label;
 	state = ctx->state;
 	state = ctx->state;
 
 
-	if ((opendata->o_arg.open_flags & O_EXCL) &&
+	if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
 	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
 	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
-		nfs4_exclusive_attrset(opendata, sattr);
+		nfs4_exclusive_attrset(opendata, sattr, &label);
 
 
 		nfs_fattr_init(opendata->o_res.f_attr);
 		nfs_fattr_init(opendata->o_res.f_attr);
 		status = nfs4_do_setattr(state->inode, cred,
 		status = nfs4_do_setattr(state->inode, cred,
@@ -2439,7 +2466,7 @@ static int _nfs4_do_open(struct inode *dir,
 			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
 			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
 		}
 		}
 	}
 	}
-	if (opendata->file_created)
+	if (opened && opendata->file_created)
 		*opened |= FILE_CREATED;
 		*opened |= FILE_CREATED;
 
 
 	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
 	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
@@ -2661,7 +2688,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	switch (task->tk_status) {
 	switch (task->tk_status) {
 		case 0:
 		case 0:
 			res_stateid = &calldata->res.stateid;
 			res_stateid = &calldata->res.stateid;
-			if (calldata->arg.fmode == 0 && calldata->roc)
+			if (calldata->roc)
 				pnfs_roc_set_barrier(state->inode,
 				pnfs_roc_set_barrier(state->inode,
 						     calldata->roc_barrier);
 						     calldata->roc_barrier);
 			renew_lease(server, calldata->timestamp);
 			renew_lease(server, calldata->timestamp);
@@ -2684,7 +2711,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 				goto out_release;
 				goto out_release;
 			}
 			}
 	}
 	}
-	nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
+	nfs_clear_open_stateid(state, &calldata->arg.stateid,
+			res_stateid, calldata->arg.fmode);
 out_release:
 out_release:
 	nfs_release_seqid(calldata->arg.seqid);
 	nfs_release_seqid(calldata->arg.seqid);
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2735,14 +2763,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 		goto out_no_action;
 		goto out_no_action;
 	}
 	}
 
 
-	if (calldata->arg.fmode == 0) {
+	if (calldata->arg.fmode == 0)
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
-		if (calldata->roc &&
-		    pnfs_roc_drain(inode, &calldata->roc_barrier, task)) {
-			nfs_release_seqid(calldata->arg.seqid);
-			goto out_wait;
-		    }
-	}
+	if (calldata->roc)
+		pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
+
 	calldata->arg.share_access =
 	calldata->arg.share_access =
 		nfs4_map_atomic_open_share(NFS_SERVER(inode),
 		nfs4_map_atomic_open_share(NFS_SERVER(inode),
 				calldata->arg.fmode, 0);
 				calldata->arg.fmode, 0);
@@ -2883,8 +2908,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 
 
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
 {
+	u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
 	struct nfs4_server_caps_arg args = {
 	struct nfs4_server_caps_arg args = {
 		.fhandle = fhandle,
 		.fhandle = fhandle,
+		.bitmask = bitmask,
 	};
 	};
 	struct nfs4_server_caps_res res = {};
 	struct nfs4_server_caps_res res = {};
 	struct rpc_message msg = {
 	struct rpc_message msg = {
@@ -2894,10 +2921,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	};
 	};
 	int status;
 	int status;
 
 
+	bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
+		     FATTR4_WORD0_FH_EXPIRE_TYPE |
+		     FATTR4_WORD0_LINK_SUPPORT |
+		     FATTR4_WORD0_SYMLINK_SUPPORT |
+		     FATTR4_WORD0_ACLSUPPORT;
+	if (minorversion)
+		bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	if (status == 0) {
 	if (status == 0) {
 		/* Sanity check the server answers */
 		/* Sanity check the server answers */
-		switch (server->nfs_client->cl_minorversion) {
+		switch (minorversion) {
 		case 0:
 		case 0:
 			res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
 			res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
 			res.attr_bitmask[2] = 0;
 			res.attr_bitmask[2] = 0;
@@ -2950,6 +2985,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->cache_consistency_bitmask[2] = 0;
 		server->cache_consistency_bitmask[2] = 0;
+		memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
+			sizeof(server->exclcreat_bitmask));
 		server->acl_bitmask = res.acl_bitmask;
 		server->acl_bitmask = res.acl_bitmask;
 		server->fh_expire_type = res.fh_expire_type;
 		server->fh_expire_type = res.fh_expire_type;
 	}
 	}
@@ -3552,7 +3589,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	struct nfs4_label l, *ilabel = NULL;
 	struct nfs4_label l, *ilabel = NULL;
 	struct nfs_open_context *ctx;
 	struct nfs_open_context *ctx;
 	struct nfs4_state *state;
 	struct nfs4_state *state;
-	int opened = 0;
 	int status = 0;
 	int status = 0;
 
 
 	ctx = alloc_nfs_open_context(dentry, FMODE_READ);
 	ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3562,7 +3598,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
 	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
 
 
 	sattr->ia_mode &= ~current_umask();
 	sattr->ia_mode &= ~current_umask();
-	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
+	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
 	if (IS_ERR(state)) {
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
 		status = PTR_ERR(state);
 		goto out;
 		goto out;
@@ -4978,13 +5014,12 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 	int result;
 	int result;
 	size_t len;
 	size_t len;
 	char *str;
 	char *str;
-	bool retried = false;
 
 
 	if (clp->cl_owner_id != NULL)
 	if (clp->cl_owner_id != NULL)
 		return 0;
 		return 0;
-retry:
+
 	rcu_read_lock();
 	rcu_read_lock();
-	len = 10 + strlen(clp->cl_ipaddr) + 1 +
+	len = 14 + strlen(clp->cl_ipaddr) + 1 +
 		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
 		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
 		1 +
 		1 +
 		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
 		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
@@ -5010,14 +5045,6 @@ retry:
 			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
 			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
 	rcu_read_unlock();
 	rcu_read_unlock();
 
 
-	/* Did something change? */
-	if (result >= len) {
-		kfree(str);
-		if (retried)
-			return -EINVAL;
-		retried = true;
-		goto retry;
-	}
 	clp->cl_owner_id = str;
 	clp->cl_owner_id = str;
 	return 0;
 	return 0;
 }
 }
@@ -5049,10 +5076,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
 			clp->rpc_ops->version, clp->cl_minorversion,
 			clp->rpc_ops->version, clp->cl_minorversion,
 			nfs4_client_id_uniquifier,
 			nfs4_client_id_uniquifier,
 			clp->cl_rpcclient->cl_nodename);
 			clp->cl_rpcclient->cl_nodename);
-	if (result >= len) {
-		kfree(str);
-		return -EINVAL;
-	}
 	clp->cl_owner_id = str;
 	clp->cl_owner_id = str;
 	return 0;
 	return 0;
 }
 }
@@ -5088,10 +5111,6 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
 	result = scnprintf(str, len, "Linux NFSv%u.%u %s",
 	result = scnprintf(str, len, "Linux NFSv%u.%u %s",
 			clp->rpc_ops->version, clp->cl_minorversion,
 			clp->rpc_ops->version, clp->cl_minorversion,
 			clp->cl_rpcclient->cl_nodename);
 			clp->cl_rpcclient->cl_nodename);
-	if (result >= len) {
-		kfree(str);
-		return -EINVAL;
-	}
 	clp->cl_owner_id = str;
 	clp->cl_owner_id = str;
 	return 0;
 	return 0;
 }
 }
@@ -5289,9 +5308,8 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
 
 	d_data = (struct nfs4_delegreturndata *)data;
 	d_data = (struct nfs4_delegreturndata *)data;
 
 
-	if (d_data->roc &&
-	    pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
-		return;
+	if (d_data->roc)
+		pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
 
 
 	nfs4_setup_sequence(d_data->res.server,
 	nfs4_setup_sequence(d_data->res.server,
 			&d_data->args.seq_args,
 			&d_data->args.seq_args,
@@ -7745,11 +7763,20 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 	switch (task->tk_status) {
 	switch (task->tk_status) {
 	case 0:
 	case 0:
 		goto out;
 		goto out;
+	/*
+	 * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
+	 * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
+	 */
+	case -NFS4ERR_BADLAYOUT:
+		goto out_overflow;
 	/*
 	/*
 	 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
 	 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
-	 * (or clients) writing to the same RAID stripe
+	 * (or clients) writing to the same RAID stripe except when
+	 * the minlength argument is 0 (see RFC5661 section 18.43.3).
 	 */
 	 */
 	case -NFS4ERR_LAYOUTTRYLATER:
 	case -NFS4ERR_LAYOUTTRYLATER:
+		if (lgp->args.minlength == 0)
+			goto out_overflow;
 	/*
 	/*
 	 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
 	 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
 	 * existing layout before getting a new one).
 	 * existing layout before getting a new one).
@@ -7805,6 +7832,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 		rpc_restart_call_prepare(task);
 		rpc_restart_call_prepare(task);
 out:
 out:
 	dprintk("<-- %s\n", __func__);
 	dprintk("<-- %s\n", __func__);
+	return;
+out_overflow:
+	task->tk_status = -EOVERFLOW;
+	goto out;
 }
 }
 
 
 static size_t max_response_pages(struct nfs_server *server)
 static size_t max_response_pages(struct nfs_server *server)
@@ -8661,6 +8692,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
+	.mig_recovery_ops = &nfs41_mig_recovery_ops,
 };
 };
 #endif
 #endif
 
 

+ 1 - 11
fs/nfs/nfs4state.c

@@ -2152,23 +2152,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 }
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
 EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
 
 
-static void nfs41_ping_server(struct nfs_client *clp)
+void nfs41_notify_server(struct nfs_client *clp)
 {
 {
 	/* Use CHECK_LEASE to ping the server with a SEQUENCE */
 	/* Use CHECK_LEASE to ping the server with a SEQUENCE */
 	set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
 	set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
 	nfs4_schedule_state_manager(clp);
 	nfs4_schedule_state_manager(clp);
 }
 }
 
 
-void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
-{
-	nfs41_ping_server(clp);
-}
-
-void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
-{
-	nfs41_ping_server(clp);
-}
-
 static void nfs4_reset_all_state(struct nfs_client *clp)
 static void nfs4_reset_all_state(struct nfs_client *clp)
 {
 {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {

+ 61 - 0
fs/nfs/nfs4trace.h

@@ -884,6 +884,66 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
 DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
 DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
 DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
 DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
 
 
+DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
+		TP_PROTO(
+			const struct nfs_client *clp,
+			const struct nfs_fh *fhandle,
+			const struct inode *inode,
+			int error
+		),
+
+		TP_ARGS(clp, fhandle, inode, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__string(dstaddr, clp ?
+				rpc_peeraddr2str(clp->cl_rpcclient,
+					RPC_DISPLAY_ADDR) : "unknown")
+		),
+
+		TP_fast_assign(
+			__entry->error = error;
+			__entry->fhandle = nfs_fhandle_hash(fhandle);
+			if (inode != NULL) {
+				__entry->fileid = NFS_FILEID(inode);
+				__entry->dev = inode->i_sb->s_dev;
+			} else {
+				__entry->fileid = 0;
+				__entry->dev = 0;
+			}
+			__assign_str(dstaddr, clp ?
+				rpc_peeraddr2str(clp->cl_rpcclient,
+					RPC_DISPLAY_ADDR) : "unknown")
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"dstaddr=%s",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			__get_str(dstaddr)
+		)
+);
+
+#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \
+	DEFINE_EVENT(nfs4_inode_callback_event, name, \
+			TP_PROTO( \
+				const struct nfs_client *clp, \
+				const struct nfs_fh *fhandle, \
+				const struct inode *inode, \
+				int error \
+			), \
+			TP_ARGS(clp, fhandle, inode, error))
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
+
+
 DECLARE_EVENT_CLASS(nfs4_idmap_event,
 DECLARE_EVENT_CLASS(nfs4_idmap_event,
 		TP_PROTO(
 		TP_PROTO(
 			const char *name,
 			const char *name,
@@ -1136,6 +1196,7 @@ TRACE_EVENT(nfs4_layoutget,
 
 
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
 
 
 #endif /* CONFIG_NFS_V4_1 */
 #endif /* CONFIG_NFS_V4_1 */
 
 

+ 54 - 21
fs/nfs/nfs4xdr.c

@@ -400,7 +400,8 @@ static int nfs4_stat_to_errno(int);
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
 #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
 #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
 				encode_stateid_maxsz + \
 				encode_stateid_maxsz + \
-				1 /* FIXME: opaque lrf_body always empty at the moment */)
+				1 + \
+				XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
 #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
 #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
 				1 + decode_stateid_maxsz)
 				1 + decode_stateid_maxsz)
 #define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
 #define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
@@ -1001,7 +1002,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
 
 
 static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
 static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
 				const struct nfs4_label *label,
 				const struct nfs4_label *label,
-				const struct nfs_server *server)
+				const struct nfs_server *server,
+				bool excl_check)
 {
 {
 	char owner_name[IDMAP_NAMESZ];
 	char owner_name[IDMAP_NAMESZ];
 	char owner_group[IDMAP_NAMESZ];
 	char owner_group[IDMAP_NAMESZ];
@@ -1067,6 +1069,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
 		bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
 		bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
 		len += 4;
 		len += 4;
 	}
 	}
+
+	if (excl_check) {
+		const u32 *excl_bmval = server->exclcreat_bitmask;
+		bmval[0] &= excl_bmval[0];
+		bmval[1] &= excl_bmval[1];
+		bmval[2] &= excl_bmval[2];
+
+		if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
+			label = NULL;
+	}
+
 	if (label) {
 	if (label) {
 		len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
 		len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
 		bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
 		bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -1154,7 +1167,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	case NF4LNK:
 	case NF4LNK:
 		p = reserve_space(xdr, 4);
 		p = reserve_space(xdr, 4);
 		*p = cpu_to_be32(create->u.symlink.len);
 		*p = cpu_to_be32(create->u.symlink.len);
-		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+		xdr_write_pages(xdr, create->u.symlink.pages, 0,
+				create->u.symlink.len);
+		xdr->buf->flags |= XDRBUF_WRITE;
 		break;
 		break;
 
 
 	case NF4BLK: case NF4CHR:
 	case NF4BLK: case NF4CHR:
@@ -1168,7 +1183,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	}
 	}
 
 
 	encode_string(xdr, create->name->len, create->name->name);
 	encode_string(xdr, create->name->len, create->name->name);
-	encode_attrs(xdr, create->attrs, create->label, create->server);
+	encode_attrs(xdr, create->attrs, create->label, create->server, false);
 }
 }
 
 
 static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
 static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1382,18 +1397,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 {
 {
-	struct iattr dummy;
 	__be32 *p;
 	__be32 *p;
 
 
 	p = reserve_space(xdr, 4);
 	p = reserve_space(xdr, 4);
 	switch(arg->createmode) {
 	switch(arg->createmode) {
 	case NFS4_CREATE_UNCHECKED:
 	case NFS4_CREATE_UNCHECKED:
 		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
 		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
-		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
 		break;
 		break;
 	case NFS4_CREATE_GUARDED:
 	case NFS4_CREATE_GUARDED:
 		*p = cpu_to_be32(NFS4_CREATE_GUARDED);
 		*p = cpu_to_be32(NFS4_CREATE_GUARDED);
-		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
 		break;
 		break;
 	case NFS4_CREATE_EXCLUSIVE:
 	case NFS4_CREATE_EXCLUSIVE:
 		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
 		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1402,8 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 	case NFS4_CREATE_EXCLUSIVE4_1:
 	case NFS4_CREATE_EXCLUSIVE4_1:
 		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
 		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
-		dummy.ia_valid = 0;
-		encode_attrs(xdr, &dummy, arg->label, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
 	}
 	}
 }
 }
 
 
@@ -1659,7 +1672,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
 {
 	encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
 	encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
 	encode_nfs4_stateid(xdr, &arg->stateid);
 	encode_nfs4_stateid(xdr, &arg->stateid);
-	encode_attrs(xdr, arg->iap, arg->label, server);
+	encode_attrs(xdr, arg->iap, arg->label, server, false);
 }
 }
 
 
 static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
 static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2580,6 +2593,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
 				     struct xdr_stream *xdr,
 				     struct xdr_stream *xdr,
 				     struct nfs4_server_caps_arg *args)
 				     struct nfs4_server_caps_arg *args)
 {
 {
+	const u32 *bitmask = args->bitmask;
 	struct compound_hdr hdr = {
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	};
@@ -2587,11 +2601,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fhandle, &hdr);
 	encode_putfh(xdr, args->fhandle, &hdr);
-	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
-			   FATTR4_WORD0_FH_EXPIRE_TYPE|
-			   FATTR4_WORD0_LINK_SUPPORT|
-			   FATTR4_WORD0_SYMLINK_SUPPORT|
-			   FATTR4_WORD0_ACLSUPPORT, &hdr);
+	encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr);
 	encode_nops(&hdr);
 	encode_nops(&hdr);
 }
 }
 
 
@@ -3368,6 +3378,22 @@ out_overflow:
 	return -EIO;
 	return -EIO;
 }
 }
 
 
+static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
+				 uint32_t *bitmap, uint32_t *bitmask)
+{
+	if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) {
+		int ret;
+		ret = decode_attr_bitmap(xdr, bitmask);
+		if (unlikely(ret < 0))
+			return ret;
+		bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+	} else
+		bitmask[0] = bitmask[1] = bitmask[2] = 0;
+	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+		bitmask[0], bitmask[1], bitmask[2]);
+	return 0;
+}
+
 static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
 static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
 {
 {
 	__be32 *p;
 	__be32 *p;
@@ -4321,6 +4347,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
 		goto xdr_error;
 		goto xdr_error;
 	if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
 	if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
 		goto xdr_error;
 		goto xdr_error;
+	if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
+				res->exclcreat_bitmask)) != 0)
+		goto xdr_error;
 	status = verify_attr_len(xdr, savep, attrlen);
 	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
 xdr_error:
 	dprintk("%s: xdr returned %d!\n", __func__, -status);
 	dprintk("%s: xdr returned %d!\n", __func__, -status);
@@ -4903,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr)
 }
 }
 
 
 /* This is too sick! */
 /* This is too sick! */
-static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
+static int decode_space_limit(struct xdr_stream *xdr,
+		unsigned long *pagemod_limit)
 {
 {
 	__be32 *p;
 	__be32 *p;
 	uint32_t limit_type, nblocks, blocksize;
 	uint32_t limit_type, nblocks, blocksize;
+	u64 maxsize = 0;
 
 
 	p = xdr_inline_decode(xdr, 12);
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 	if (unlikely(!p))
 		goto out_overflow;
 		goto out_overflow;
 	limit_type = be32_to_cpup(p++);
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	switch (limit_type) {
-	case 1:
-		xdr_decode_hyper(p, maxsize);
+	case NFS4_LIMIT_SIZE:
+		xdr_decode_hyper(p, &maxsize);
 		break;
 		break;
-	case 2:
+	case NFS4_LIMIT_BLOCKS:
 		nblocks = be32_to_cpup(p++);
 		nblocks = be32_to_cpup(p++);
 		blocksize = be32_to_cpup(p);
 		blocksize = be32_to_cpup(p);
-		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+		maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	}
+	maxsize >>= PAGE_CACHE_SHIFT;
+	*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
 	return 0;
 	return 0;
 out_overflow:
 out_overflow:
 	print_overflow_msg(__func__, xdr);
 	print_overflow_msg(__func__, xdr);
@@ -4948,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
 		break;
 		break;
 	case NFS4_OPEN_DELEGATE_WRITE:
 	case NFS4_OPEN_DELEGATE_WRITE:
 		res->delegation_type = FMODE_WRITE|FMODE_READ;
 		res->delegation_type = FMODE_WRITE|FMODE_READ;
-		if (decode_space_limit(xdr, &res->maxsize) < 0)
+		if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
 				return -EIO;
 				return -EIO;
 	}
 	}
 	return decode_ace(xdr, NULL, res->server->nfs_client);
 	return decode_ace(xdr, NULL, res->server->nfs_client);

+ 2 - 2
fs/nfs/pagelist.c

@@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
 {
 {
 	spin_lock(&hdr->lock);
 	spin_lock(&hdr->lock);
-	if (pos < hdr->io_start + hdr->good_bytes) {
-		set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+	if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
+	    || pos < hdr->io_start + hdr->good_bytes) {
 		clear_bit(NFS_IOHDR_EOF, &hdr->flags);
 		clear_bit(NFS_IOHDR_EOF, &hdr->flags);
 		hdr->good_bytes = pos - hdr->io_start;
 		hdr->good_bytes = pos - hdr->io_start;
 		hdr->error = error;
 		hdr->error = error;

+ 135 - 92
fs/nfs/pnfs.c

@@ -368,7 +368,6 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
 	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
 	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
 		return false;
 		return false;
 	lo->plh_return_iomode = 0;
 	lo->plh_return_iomode = 0;
-	lo->plh_block_lgets++;
 	pnfs_get_layout_hdr(lo);
 	pnfs_get_layout_hdr(lo);
 	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
 	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
 	return true;
 	return true;
@@ -817,25 +816,12 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
 	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
 	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
 }
 }
 
 
-static bool
-pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
-		      struct pnfs_layout_range *range)
-{
-	return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
-		(lo->plh_return_iomode == IOMODE_ANY ||
-		 lo->plh_return_iomode == range->iomode);
-}
-
 /* lget is set to 1 if called from inside send_layoutget call chain */
 /* lget is set to 1 if called from inside send_layoutget call chain */
 static bool
 static bool
-pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
-			struct pnfs_layout_range *range, int lget)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
 {
 {
 	return lo->plh_block_lgets ||
 	return lo->plh_block_lgets ||
-		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-		(list_empty(&lo->plh_segs) &&
-		 (atomic_read(&lo->plh_outstanding) > lget)) ||
-		pnfs_layout_returning(lo, range);
+		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 }
 }
 
 
 int
 int
@@ -847,7 +833,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 
 
 	dprintk("--> %s\n", __func__);
 	dprintk("--> %s\n", __func__);
 	spin_lock(&lo->plh_inode->i_lock);
 	spin_lock(&lo->plh_inode->i_lock);
-	if (pnfs_layoutgets_blocked(lo, range, 1)) {
+	if (pnfs_layoutgets_blocked(lo)) {
 		status = -EAGAIN;
 		status = -EAGAIN;
 	} else if (!nfs4_valid_open_stateid(open_state)) {
 	} else if (!nfs4_valid_open_stateid(open_state)) {
 		status = -EBADF;
 		status = -EBADF;
@@ -882,6 +868,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs4_layoutget *lgp;
 	struct nfs4_layoutget *lgp;
 	struct pnfs_layout_segment *lseg;
 	struct pnfs_layout_segment *lseg;
+	loff_t i_size;
 
 
 	dprintk("--> %s\n", __func__);
 	dprintk("--> %s\n", __func__);
 
 
@@ -889,9 +876,17 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	if (lgp == NULL)
 	if (lgp == NULL)
 		return NULL;
 		return NULL;
 
 
+	i_size = i_size_read(ino);
+
 	lgp->args.minlength = PAGE_CACHE_SIZE;
 	lgp->args.minlength = PAGE_CACHE_SIZE;
 	if (lgp->args.minlength > range->length)
 	if (lgp->args.minlength > range->length)
 		lgp->args.minlength = range->length;
 		lgp->args.minlength = range->length;
+	if (range->iomode == IOMODE_READ) {
+		if (range->offset >= i_size)
+			lgp->args.minlength = 0;
+		else if (i_size - range->offset < lgp->args.minlength)
+			lgp->args.minlength = i_size - range->offset;
+	}
 	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
 	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
 	lgp->args.range = *range;
 	lgp->args.range = *range;
 	lgp->args.type = server->pnfs_curr_ld->id;
 	lgp->args.type = server->pnfs_curr_ld->id;
@@ -956,9 +951,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
 	if (unlikely(lrp == NULL)) {
 	if (unlikely(lrp == NULL)) {
 		status = -ENOMEM;
 		status = -ENOMEM;
 		spin_lock(&ino->i_lock);
 		spin_lock(&ino->i_lock);
-		lo->plh_block_lgets--;
 		pnfs_clear_layoutreturn_waitbit(lo);
 		pnfs_clear_layoutreturn_waitbit(lo);
-		rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
 		spin_unlock(&ino->i_lock);
 		spin_unlock(&ino->i_lock);
 		pnfs_put_layout_hdr(lo);
 		pnfs_put_layout_hdr(lo);
 		goto out;
 		goto out;
@@ -1080,15 +1073,14 @@ bool pnfs_roc(struct inode *ino)
 	struct pnfs_layout_segment *lseg, *tmp;
 	struct pnfs_layout_segment *lseg, *tmp;
 	nfs4_stateid stateid;
 	nfs4_stateid stateid;
 	LIST_HEAD(tmp_list);
 	LIST_HEAD(tmp_list);
-	bool found = false, layoutreturn = false;
+	bool found = false, layoutreturn = false, roc = false;
 
 
 	spin_lock(&ino->i_lock);
 	spin_lock(&ino->i_lock);
 	lo = nfsi->layout;
 	lo = nfsi->layout;
-	if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
-	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+	if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
 		goto out_noroc;
 		goto out_noroc;
 
 
-	/* Don't return layout if we hold a delegation */
+	/* no roc if we hold a delegation */
 	if (nfs4_check_delegation(ino, FMODE_READ))
 	if (nfs4_check_delegation(ino, FMODE_READ))
 		goto out_noroc;
 		goto out_noroc;
 
 
@@ -1099,34 +1091,41 @@ bool pnfs_roc(struct inode *ino)
 			goto out_noroc;
 			goto out_noroc;
 	}
 	}
 
 
+	stateid = lo->plh_stateid;
+	/* always send layoutreturn if being marked so */
+	if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+				   &lo->plh_flags))
+		layoutreturn = pnfs_prepare_layoutreturn(lo);
+
 	pnfs_clear_retry_layoutget(lo);
 	pnfs_clear_retry_layoutget(lo);
 	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
 	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
-		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+		/* If we are sending layoutreturn, invalidate all valid lsegs */
+		if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
 			mark_lseg_invalid(lseg, &tmp_list);
 			mark_lseg_invalid(lseg, &tmp_list);
 			found = true;
 			found = true;
 		}
 		}
-	if (!found)
-		goto out_noroc;
-	lo->plh_block_lgets++;
-	pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
-	spin_unlock(&ino->i_lock);
-	pnfs_free_lseg_list(&tmp_list);
-	pnfs_layoutcommit_inode(ino, true);
-	return true;
+	/* pnfs_prepare_layoutreturn() grabs lo ref and it will be put
+	 * in pnfs_roc_release(). We don't really send a layoutreturn but
+	 * still want others to view us like we are sending one!
+	 *
+	 * If pnfs_prepare_layoutreturn() fails, it means someone else is doing
+	 * LAYOUTRETURN, so we proceed like there are no layouts to return.
+	 *
+	 * ROC in three conditions:
+	 * 1. there are ROC lsegs
+	 * 2. we don't send layoutreturn
+	 * 3. no others are sending layoutreturn
+	 */
+	if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo))
+		roc = true;
 
 
 out_noroc:
 out_noroc:
-	if (lo) {
-		stateid = lo->plh_stateid;
-		if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-					   &lo->plh_flags))
-			layoutreturn = pnfs_prepare_layoutreturn(lo);
-	}
 	spin_unlock(&ino->i_lock);
 	spin_unlock(&ino->i_lock);
-	if (layoutreturn) {
-		pnfs_layoutcommit_inode(ino, true);
+	pnfs_free_lseg_list(&tmp_list);
+	pnfs_layoutcommit_inode(ino, true);
+	if (layoutreturn)
 		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
 		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
-	}
-	return false;
+	return roc;
 }
 }
 
 
 void pnfs_roc_release(struct inode *ino)
 void pnfs_roc_release(struct inode *ino)
@@ -1135,7 +1134,7 @@ void pnfs_roc_release(struct inode *ino)
 
 
 	spin_lock(&ino->i_lock);
 	spin_lock(&ino->i_lock);
 	lo = NFS_I(ino)->layout;
 	lo = NFS_I(ino)->layout;
-	lo->plh_block_lgets--;
+	pnfs_clear_layoutreturn_waitbit(lo);
 	if (atomic_dec_and_test(&lo->plh_refcount)) {
 	if (atomic_dec_and_test(&lo->plh_refcount)) {
 		pnfs_detach_layout_hdr(lo);
 		pnfs_detach_layout_hdr(lo);
 		spin_unlock(&ino->i_lock);
 		spin_unlock(&ino->i_lock);
@@ -1153,27 +1152,16 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
 	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
 		lo->plh_barrier = barrier;
 		lo->plh_barrier = barrier;
 	spin_unlock(&ino->i_lock);
 	spin_unlock(&ino->i_lock);
+	trace_nfs4_layoutreturn_on_close(ino, 0);
 }
 }
 
 
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
 {
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct pnfs_layout_hdr *lo;
 	struct pnfs_layout_hdr *lo;
-	struct pnfs_layout_segment *lseg;
-	nfs4_stateid stateid;
 	u32 current_seqid;
 	u32 current_seqid;
-	bool layoutreturn = false;
 
 
 	spin_lock(&ino->i_lock);
 	spin_lock(&ino->i_lock);
-	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) {
-		if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags))
-			continue;
-		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
-			continue;
-		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-		spin_unlock(&ino->i_lock);
-		return true;
-	}
 	lo = nfsi->layout;
 	lo = nfsi->layout;
 	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 
 
@@ -1181,19 +1169,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
 	 * a barrier, we choose the worst-case barrier.
 	 * a barrier, we choose the worst-case barrier.
 	 */
 	 */
 	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
 	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
-	stateid = lo->plh_stateid;
-	if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-					   &lo->plh_flags))
-		layoutreturn = pnfs_prepare_layoutreturn(lo);
-	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
-		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-
 	spin_unlock(&ino->i_lock);
 	spin_unlock(&ino->i_lock);
-	if (layoutreturn) {
-		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
-		return true;
-	}
-	return false;
 }
 }
 
 
 /*
 /*
@@ -1221,16 +1197,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
 	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
 	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
 }
 }
 
 
-static void
-pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
-		   struct pnfs_layout_segment *lseg)
+static bool
+pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
+		const struct pnfs_layout_range *l2)
 {
 {
-	struct pnfs_layout_segment *lp;
+	return pnfs_lseg_range_cmp(l1, l2) > 0;
+}
+
+static bool
+pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
+		struct pnfs_layout_segment *old)
+{
+	return false;
+}
+
+void
+pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+		   struct pnfs_layout_segment *lseg,
+		   bool (*is_after)(const struct pnfs_layout_range *,
+			   const struct pnfs_layout_range *),
+		   bool (*do_merge)(struct pnfs_layout_segment *,
+			   struct pnfs_layout_segment *),
+		   struct list_head *free_me)
+{
+	struct pnfs_layout_segment *lp, *tmp;
 
 
 	dprintk("%s:Begin\n", __func__);
 	dprintk("%s:Begin\n", __func__);
 
 
-	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-		if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
+	list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
+		if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
+			continue;
+		if (do_merge(lseg, lp)) {
+			mark_lseg_invalid(lp, free_me);
+			continue;
+		}
+		if (is_after(&lseg->pls_range, &lp->pls_range))
 			continue;
 			continue;
 		list_add_tail(&lseg->pls_list, &lp->pls_list);
 		list_add_tail(&lseg->pls_list, &lp->pls_list);
 		dprintk("%s: inserted lseg %p "
 		dprintk("%s: inserted lseg %p "
@@ -1252,6 +1253,24 @@ out:
 
 
 	dprintk("%s:Return\n", __func__);
 	dprintk("%s:Return\n", __func__);
 }
 }
+EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
+
+static void
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+		   struct pnfs_layout_segment *lseg,
+		   struct list_head *free_me)
+{
+	struct inode *inode = lo->plh_inode;
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld->add_lseg != NULL)
+		ld->add_lseg(lo, lseg, free_me);
+	else
+		pnfs_generic_layout_insert_lseg(lo, lseg,
+				pnfs_lseg_range_is_after,
+				pnfs_lseg_no_merge,
+				free_me);
+}
 
 
 static struct pnfs_layout_hdr *
 static struct pnfs_layout_hdr *
 alloc_init_layout_hdr(struct inode *ino,
 alloc_init_layout_hdr(struct inode *ino,
@@ -1344,8 +1363,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 			ret = pnfs_get_lseg(lseg);
 			ret = pnfs_get_lseg(lseg);
 			break;
 			break;
 		}
 		}
-		if (lseg->pls_range.offset > range->offset)
-			break;
 	}
 	}
 
 
 	dprintk("%s:Return lseg %p ref %d\n",
 	dprintk("%s:Return lseg %p ref %d\n",
@@ -1438,6 +1455,8 @@ static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
 
 
 static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
 static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
 {
 {
+	if (!pnfs_should_retry_layoutget(lo))
+		return false;
 	/*
 	/*
 	 * send layoutcommit as it can hold up layoutreturn due to lseg
 	 * send layoutcommit as it can hold up layoutreturn due to lseg
 	 * reference
 	 * reference
@@ -1484,6 +1503,9 @@ pnfs_update_layout(struct inode *ino,
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
 		goto out;
 		goto out;
 
 
+	if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+		goto out;
+
 	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
 	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
 		goto out;
 		goto out;
 
 
@@ -1533,8 +1555,7 @@ lookup_again:
 	 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
 	 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
 	 * for LAYOUTRETURN even if first is true.
 	 * for LAYOUTRETURN even if first is true.
 	 */
 	 */
-	if (!lseg && pnfs_should_retry_layoutget(lo) &&
-	    test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
 		spin_unlock(&ino->i_lock);
 		spin_unlock(&ino->i_lock);
 		dprintk("%s wait for layoutreturn\n", __func__);
 		dprintk("%s wait for layoutreturn\n", __func__);
 		if (pnfs_prepare_to_retry_layoutget(lo)) {
 		if (pnfs_prepare_to_retry_layoutget(lo)) {
@@ -1547,7 +1568,7 @@ lookup_again:
 		goto out_put_layout_hdr;
 		goto out_put_layout_hdr;
 	}
 	}
 
 
-	if (pnfs_layoutgets_blocked(lo, &arg, 0))
+	if (pnfs_layoutgets_blocked(lo))
 		goto out_unlock;
 		goto out_unlock;
 	atomic_inc(&lo->plh_outstanding);
 	atomic_inc(&lo->plh_outstanding);
 	spin_unlock(&ino->i_lock);
 	spin_unlock(&ino->i_lock);
@@ -1593,6 +1614,26 @@ out_unlock:
 }
 }
 EXPORT_SYMBOL_GPL(pnfs_update_layout);
 EXPORT_SYMBOL_GPL(pnfs_update_layout);
 
 
+static bool
+pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
+{
+	switch (range->iomode) {
+	case IOMODE_READ:
+	case IOMODE_RW:
+		break;
+	default:
+		return false;
+	}
+	if (range->offset == NFS4_MAX_UINT64)
+		return false;
+	if (range->length == 0)
+		return false;
+	if (range->length != NFS4_MAX_UINT64 &&
+	    range->length > NFS4_MAX_UINT64 - range->offset)
+		return false;
+	return true;
+}
+
 struct pnfs_layout_segment *
 struct pnfs_layout_segment *
 pnfs_layout_process(struct nfs4_layoutget *lgp)
 pnfs_layout_process(struct nfs4_layoutget *lgp)
 {
 {
@@ -1601,7 +1642,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	struct pnfs_layout_segment *lseg;
 	struct pnfs_layout_segment *lseg;
 	struct inode *ino = lo->plh_inode;
 	struct inode *ino = lo->plh_inode;
 	LIST_HEAD(free_me);
 	LIST_HEAD(free_me);
-	int status = 0;
+	int status = -EINVAL;
+
+	if (!pnfs_sanity_check_layout_range(&res->range))
+		goto out;
 
 
 	/* Inject layout blob into I/O device driver */
 	/* Inject layout blob into I/O device driver */
 	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
 	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
@@ -1619,12 +1663,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	lseg->pls_range = res->range;
 	lseg->pls_range = res->range;
 
 
 	spin_lock(&ino->i_lock);
 	spin_lock(&ino->i_lock);
-	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
-		dprintk("%s forget reply due to recall\n", __func__);
-		goto out_forget_reply;
-	}
-
-	if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
+	if (pnfs_layoutgets_blocked(lo)) {
 		dprintk("%s forget reply due to state\n", __func__);
 		dprintk("%s forget reply due to state\n", __func__);
 		goto out_forget_reply;
 		goto out_forget_reply;
 	}
 	}
@@ -1651,12 +1690,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 	clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 
 
 	pnfs_get_lseg(lseg);
 	pnfs_get_lseg(lseg);
-	pnfs_layout_insert_lseg(lo, lseg);
+	pnfs_layout_insert_lseg(lo, lseg, &free_me);
 
 
-	if (res->return_on_close) {
+	if (res->return_on_close)
 		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
 		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
-		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
-	}
 
 
 	spin_unlock(&ino->i_lock);
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me);
 	pnfs_free_lseg_list(&free_me);
@@ -1692,6 +1729,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
 				lseg->pls_range.length);
 				lseg->pls_range.length);
 			set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
 			set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
 			mark_lseg_invalid(lseg, tmp_list);
 			mark_lseg_invalid(lseg, tmp_list);
+			set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+					&lo->plh_flags);
 		}
 		}
 }
 }
 
 
@@ -2267,7 +2306,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 
 
 #if IS_ENABLED(CONFIG_NFS_V4_2)
 #if IS_ENABLED(CONFIG_NFS_V4_2)
 int
 int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
 {
 {
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -2294,7 +2333,7 @@ pnfs_report_layoutstat(struct inode *inode)
 	pnfs_get_layout_hdr(hdr);
 	pnfs_get_layout_hdr(hdr);
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode->i_lock);
 
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = kzalloc(sizeof(*data), gfp_flags);
 	if (!data) {
 	if (!data) {
 		status = -ENOMEM;
 		status = -ENOMEM;
 		goto out_put;
 		goto out_put;
@@ -2324,3 +2363,7 @@ out_put:
 }
 }
 EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
 EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
 #endif
 #endif
+
+unsigned int layoutstats_timer;
+module_param(layoutstats_timer, uint, 0644);
+EXPORT_SYMBOL_GPL(layoutstats_timer);

+ 38 - 10
fs/nfs/pnfs.h

@@ -94,7 +94,6 @@ enum {
 	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
 	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
 	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
 	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
-	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
 	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */
 	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */
 	NFS_LAYOUT_RETURN_BEFORE_CLOSE,	/* Return this layout before close */
 	NFS_LAYOUT_RETURN_BEFORE_CLOSE,	/* Return this layout before close */
 	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
 	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
@@ -129,6 +128,9 @@ struct pnfs_layoutdriver_type {
 
 
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
+	void (*add_lseg) (struct pnfs_layout_hdr *layoutid,
+			struct pnfs_layout_segment *lseg,
+			struct list_head *free_me);
 
 
 	void (*return_range) (struct pnfs_layout_hdr *lo,
 	void (*return_range) (struct pnfs_layout_hdr *lo,
 			      struct pnfs_layout_range *range);
 			      struct pnfs_layout_range *range);
@@ -184,15 +186,15 @@ struct pnfs_layoutdriver_type {
 
 
 struct pnfs_layout_hdr {
 struct pnfs_layout_hdr {
 	atomic_t		plh_refcount;
 	atomic_t		plh_refcount;
+	atomic_t		plh_outstanding; /* number of RPCs out */
 	struct list_head	plh_layouts;   /* other client layouts */
 	struct list_head	plh_layouts;   /* other client layouts */
 	struct list_head	plh_bulk_destroy;
 	struct list_head	plh_bulk_destroy;
 	struct list_head	plh_segs;      /* layout segments list */
 	struct list_head	plh_segs;      /* layout segments list */
-	nfs4_stateid		plh_stateid;
-	atomic_t		plh_outstanding; /* number of RPCs out */
 	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */
 	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */
-	u32			plh_barrier; /* ignore lower seqids */
 	unsigned long		plh_retry_timestamp;
 	unsigned long		plh_retry_timestamp;
 	unsigned long		plh_flags;
 	unsigned long		plh_flags;
+	nfs4_stateid		plh_stateid;
+	u32			plh_barrier; /* ignore lower seqids */
 	enum pnfs_iomode	plh_return_iomode;
 	enum pnfs_iomode	plh_return_iomode;
 	loff_t			plh_lwb; /* last write byte for layoutcommit */
 	loff_t			plh_lwb; /* last write byte for layoutcommit */
 	struct rpc_cred		*plh_lc_cred; /* layoutcommit cred */
 	struct rpc_cred		*plh_lc_cred; /* layoutcommit cred */
@@ -267,7 +269,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 bool pnfs_roc(struct inode *ino);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
 void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -286,6 +288,14 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       gfp_t gfp_flags);
 					       gfp_t gfp_flags);
 void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
 void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
 
 
+void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+		   struct pnfs_layout_segment *lseg,
+		   bool (*is_after)(const struct pnfs_layout_range *lseg_range,
+			   const struct pnfs_layout_range *old),
+		   bool (*do_merge)(struct pnfs_layout_segment *lseg,
+			   struct pnfs_layout_segment *old),
+		   struct list_head *free_me);
+
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
 int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
 int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
 int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
@@ -529,12 +539,31 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
 					nfss->pnfs_curr_ld->id == src->l_type);
 					nfss->pnfs_curr_ld->id == src->l_type);
 }
 }
 
 
+static inline u64
+pnfs_calc_offset_end(u64 offset, u64 len)
+{
+	if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset)
+		return NFS4_MAX_UINT64;
+	return offset + len - 1;
+}
+
+static inline u64
+pnfs_calc_offset_length(u64 offset, u64 end)
+{
+	if (end == NFS4_MAX_UINT64 || end <= offset)
+		return NFS4_MAX_UINT64;
+	return 1 + end - offset;
+}
+
+extern unsigned int layoutstats_timer;
+
 #ifdef NFS_DEBUG
 #ifdef NFS_DEBUG
 void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 #else
 #else
 static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
 static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
 {
 {
 }
 }
+
 #endif /* NFS_DEBUG */
 #endif /* NFS_DEBUG */
 #else  /* CONFIG_NFS_V4_1 */
 #else  /* CONFIG_NFS_V4_1 */
 
 
@@ -605,10 +634,9 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 {
 {
 }
 }
 
 
-static inline bool
-pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+static inline void
+pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
 {
 {
-	return false;
 }
 }
 
 
 static inline void set_pnfs_layoutdriver(struct nfs_server *s,
 static inline void set_pnfs_layoutdriver(struct nfs_server *s,
@@ -691,10 +719,10 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
 #endif /* CONFIG_NFS_V4_1 */
 #endif /* CONFIG_NFS_V4_1 */
 
 
 #if IS_ENABLED(CONFIG_NFS_V4_2)
 #if IS_ENABLED(CONFIG_NFS_V4_2)
-int pnfs_report_layoutstat(struct inode *inode);
+int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags);
 #else
 #else
 static inline int
 static inline int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
 {
 {
 	return 0;
 	return 0;
 }
 }

+ 54 - 34
fs/nfs/pnfs_nfs.c

@@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
 	if (ret) {
 	if (ret) {
 		cinfo->ds->nwritten -= ret;
 		cinfo->ds->nwritten -= ret;
 		cinfo->ds->ncommitting += ret;
 		cinfo->ds->ncommitting += ret;
-		bucket->clseg = bucket->wlseg;
-		if (list_empty(src))
+		if (bucket->clseg == NULL)
+			bucket->clseg = pnfs_get_lseg(bucket->wlseg);
+		if (list_empty(src)) {
+			pnfs_put_lseg_locked(bucket->wlseg);
 			bucket->wlseg = NULL;
 			bucket->wlseg = NULL;
-		else
-			pnfs_get_lseg(bucket->clseg);
+		}
 	}
 	}
 	return ret;
 	return ret;
 }
 }
@@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
 	struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
 	struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
 	struct pnfs_commit_bucket *bucket;
 	struct pnfs_commit_bucket *bucket;
 	struct pnfs_layout_segment *freeme;
 	struct pnfs_layout_segment *freeme;
+	LIST_HEAD(pages);
 	int i;
 	int i;
 
 
+	spin_lock(cinfo->lock);
 	for (i = idx; i < fl_cinfo->nbuckets; i++) {
 	for (i = idx; i < fl_cinfo->nbuckets; i++) {
 		bucket = &fl_cinfo->buckets[i];
 		bucket = &fl_cinfo->buckets[i];
 		if (list_empty(&bucket->committing))
 		if (list_empty(&bucket->committing))
 			continue;
 			continue;
-		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
-		spin_lock(cinfo->lock);
 		freeme = bucket->clseg;
 		freeme = bucket->clseg;
 		bucket->clseg = NULL;
 		bucket->clseg = NULL;
+		list_splice_init(&bucket->committing, &pages);
 		spin_unlock(cinfo->lock);
 		spin_unlock(cinfo->lock);
+		nfs_retry_commit(&pages, freeme, cinfo, i);
 		pnfs_put_lseg(freeme);
 		pnfs_put_lseg(freeme);
+		spin_lock(cinfo->lock);
 	}
 	}
+	spin_unlock(cinfo->lock);
 }
 }
 
 
 static unsigned int
 static unsigned int
@@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
 		if (!data)
 		if (!data)
 			break;
 			break;
 		data->ds_commit_index = i;
 		data->ds_commit_index = i;
-		spin_lock(cinfo->lock);
-		data->lseg = bucket->clseg;
-		bucket->clseg = NULL;
-		spin_unlock(cinfo->lock);
 		list_add(&data->pages, list);
 		list_add(&data->pages, list);
 		nreq++;
 		nreq++;
 	}
 	}
@@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
 	return nreq;
 	return nreq;
 }
 }
 
 
+static inline
+void pnfs_fetch_commit_bucket_list(struct list_head *pages,
+		struct nfs_commit_data *data,
+		struct nfs_commit_info *cinfo)
+{
+	struct pnfs_commit_bucket *bucket;
+
+	bucket = &cinfo->ds->buckets[data->ds_commit_index];
+	spin_lock(cinfo->lock);
+	list_splice_init(&bucket->committing, pages);
+	data->lseg = bucket->clseg;
+	bucket->clseg = NULL;
+	spin_unlock(cinfo->lock);
+
+}
+
 /* This follows nfs_commit_list pretty closely */
 /* This follows nfs_commit_list pretty closely */
 int
 int
 pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -243,7 +260,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 	if (!list_empty(mds_pages)) {
 	if (!list_empty(mds_pages)) {
 		data = nfs_commitdata_alloc();
 		data = nfs_commitdata_alloc();
 		if (data != NULL) {
 		if (data != NULL) {
-			data->lseg = NULL;
+			data->ds_commit_index = -1;
 			list_add(&data->pages, &list);
 			list_add(&data->pages, &list);
 			nreq++;
 			nreq++;
 		} else {
 		} else {
@@ -265,19 +282,16 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 
 
 	list_for_each_entry_safe(data, tmp, &list, pages) {
 	list_for_each_entry_safe(data, tmp, &list, pages) {
 		list_del_init(&data->pages);
 		list_del_init(&data->pages);
-		if (!data->lseg) {
+		if (data->ds_commit_index < 0) {
 			nfs_init_commit(data, mds_pages, NULL, cinfo);
 			nfs_init_commit(data, mds_pages, NULL, cinfo);
 			nfs_initiate_commit(NFS_CLIENT(inode), data,
 			nfs_initiate_commit(NFS_CLIENT(inode), data,
 					    NFS_PROTO(data->inode),
 					    NFS_PROTO(data->inode),
 					    data->mds_ops, how, 0);
 					    data->mds_ops, how, 0);
 		} else {
 		} else {
-			struct pnfs_commit_bucket *buckets;
+			LIST_HEAD(pages);
 
 
-			buckets = cinfo->ds->buckets;
-			nfs_init_commit(data,
-					&buckets[data->ds_commit_index].committing,
-					data->lseg,
-					cinfo);
+			pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+			nfs_init_commit(data, &pages, data->lseg, cinfo);
 			initiate_commit(data, how);
 			initiate_commit(data, how);
 		}
 		}
 	}
 	}
@@ -359,26 +373,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
 	return false;
 	return false;
 }
 }
 
 
+/*
+ * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does,
+ * declare a match.
+ */
 static bool
 static bool
 _same_data_server_addrs_locked(const struct list_head *dsaddrs1,
 _same_data_server_addrs_locked(const struct list_head *dsaddrs1,
 			       const struct list_head *dsaddrs2)
 			       const struct list_head *dsaddrs2)
 {
 {
 	struct nfs4_pnfs_ds_addr *da1, *da2;
 	struct nfs4_pnfs_ds_addr *da1, *da2;
-
-	/* step through both lists, comparing as we go */
-	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
-	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
-	     da1 != NULL && da2 != NULL;
-	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
-	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
-		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
-				   (struct sockaddr *)&da2->da_addr))
-			return false;
+	struct sockaddr *sa1, *sa2;
+	bool match = false;
+
+	list_for_each_entry(da1, dsaddrs1, da_node) {
+		sa1 = (struct sockaddr *)&da1->da_addr;
+		match = false;
+		list_for_each_entry(da2, dsaddrs2, da_node) {
+			sa2 = (struct sockaddr *)&da2->da_addr;
+			match = same_sockaddr(sa1, sa2);
+			if (match)
+				break;
+		}
+		if (!match)
+			break;
 	}
 	}
-	if (da1 == NULL && da2 == NULL)
-		return true;
-
-	return false;
+	return match;
 }
 }
 
 
 /*
 /*
@@ -863,9 +882,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
 	}
 	}
 	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 	cinfo->ds->nwritten++;
 	cinfo->ds->nwritten++;
-	spin_unlock(cinfo->lock);
 
 
-	nfs_request_add_commit_list(req, list, cinfo);
+	nfs_request_add_commit_list_locked(req, list, cinfo);
+	spin_unlock(cinfo->lock);
+	nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 }
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
 
 

+ 5 - 2
fs/nfs/super.c

@@ -381,9 +381,12 @@ int __init register_nfs_fs(void)
 	ret = nfs_register_sysctl();
 	ret = nfs_register_sysctl();
 	if (ret < 0)
 	if (ret < 0)
 		goto error_2;
 		goto error_2;
-	register_shrinker(&acl_shrinker);
+	ret = register_shrinker(&acl_shrinker);
+	if (ret < 0)
+		goto error_3;
 	return 0;
 	return 0;
-
+error_3:
+	nfs_unregister_sysctl();
 error_2:
 error_2:
 	unregister_nfs4_fs();
 	unregister_nfs4_fs();
 error_1:
 error_1:

+ 25 - 11
fs/nfs/write.c

@@ -767,6 +767,28 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
 	return NULL;
 	return NULL;
 }
 }
 
 
+/**
+ * nfs_request_add_commit_list_locked - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must hold the cinfo->lock, and the nfs_page lock.
+ */
+void
+nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
+			    struct nfs_commit_info *cinfo)
+{
+	set_bit(PG_CLEAN, &req->wb_flags);
+	nfs_list_add_request(req, dst);
+	cinfo->mds->ncommit++;
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
+
 /**
 /**
  * nfs_request_add_commit_list - add request to a commit list
  * nfs_request_add_commit_list - add request to a commit list
  * @req: pointer to a struct nfs_page
  * @req: pointer to a struct nfs_page
@@ -784,13 +806,10 @@ void
 nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 			    struct nfs_commit_info *cinfo)
 			    struct nfs_commit_info *cinfo)
 {
 {
-	set_bit(PG_CLEAN, &(req)->wb_flags);
 	spin_lock(cinfo->lock);
 	spin_lock(cinfo->lock);
-	nfs_list_add_request(req, dst);
-	cinfo->mds->ncommit++;
+	nfs_request_add_commit_list_locked(req, dst, cinfo);
 	spin_unlock(cinfo->lock);
 	spin_unlock(cinfo->lock);
-	if (!cinfo->dreq)
-		nfs_mark_page_unstable(req->wb_page);
+	nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
 
@@ -1793,7 +1812,7 @@ out_mark_dirty:
 	return res;
 	return res;
 }
 }
 
 
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int flags = FLUSH_SYNC;
 	int flags = FLUSH_SYNC;
@@ -1828,11 +1847,6 @@ out_mark_dirty:
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return ret;
 	return ret;
 }
 }
-
-int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
-	return nfs_commit_unstable_pages(inode, wbc);
-}
 EXPORT_SYMBOL_GPL(nfs_write_inode);
 EXPORT_SYMBOL_GPL(nfs_write_inode);
 
 
 /*
 /*

+ 1 - 1
fs/nfsd/blocklayoutxdr.c

@@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 	}
 	}
 
 
 	nr_iomaps = be32_to_cpup(p++);
 	nr_iomaps = be32_to_cpup(p++);
-	expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+	expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
 	if (len != expected) {
 	if (len != expected) {
 		dprintk("%s: extent array size mismatch: %u/%u\n",
 		dprintk("%s: extent array size mismatch: %u/%u\n",
 			__func__, len, expected);
 			__func__, len, expected);

+ 0 - 15
fs/nfsd/blocklayoutxdr.h

@@ -7,13 +7,6 @@
 struct iomap;
 struct iomap;
 struct xdr_stream;
 struct xdr_stream;
 
 
-enum pnfs_block_extent_state {
-	PNFS_BLOCK_READWRITE_DATA	= 0,
-	PNFS_BLOCK_READ_DATA		= 1,
-	PNFS_BLOCK_INVALID_DATA		= 2,
-	PNFS_BLOCK_NONE_DATA		= 3,
-};
-
 struct pnfs_block_extent {
 struct pnfs_block_extent {
 	struct nfsd4_deviceid		vol_id;
 	struct nfsd4_deviceid		vol_id;
 	u64				foff;
 	u64				foff;
@@ -21,14 +14,6 @@ struct pnfs_block_extent {
 	u64				soff;
 	u64				soff;
 	enum pnfs_block_extent_state	es;
 	enum pnfs_block_extent_state	es;
 };
 };
-#define NFS4_BLOCK_EXTENT_SIZE		44
-
-enum pnfs_block_volume_type {
-	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
-	PNFS_BLOCK_VOLUME_SLICE		= 1,
-	PNFS_BLOCK_VOLUME_CONCAT	= 2,
-	PNFS_BLOCK_VOLUME_STRIPE	= 3,
-};
 
 
 /*
 /*
  * Random upper cap for the uuid length to avoid unbounded allocation.
  * Random upper cap for the uuid length to avoid unbounded allocation.

+ 18 - 0
include/linux/nfs4.h

@@ -547,6 +547,24 @@ enum pnfs_notify_deviceid_type4 {
 	NOTIFY_DEVICEID4_DELETE = 1 << 2,
 	NOTIFY_DEVICEID4_DELETE = 1 << 2,
 };
 };
 
 
+enum pnfs_block_volume_type {
+	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
+	PNFS_BLOCK_VOLUME_SLICE		= 1,
+	PNFS_BLOCK_VOLUME_CONCAT	= 2,
+	PNFS_BLOCK_VOLUME_STRIPE	= 3,
+};
+
+enum pnfs_block_extent_state {
+	PNFS_BLOCK_READWRITE_DATA	= 0,
+	PNFS_BLOCK_READ_DATA		= 1,
+	PNFS_BLOCK_INVALID_DATA		= 2,
+	PNFS_BLOCK_NONE_DATA		= 3,
+};
+
+/* on the wire size of a block layout extent */
+#define PNFS_BLOCK_EXTENT_SIZE \
+	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
+
 #define NFL4_UFLG_MASK			0x0000003F
 #define NFL4_UFLG_MASK			0x0000003F
 #define NFL4_UFLG_DENSE			0x00000001
 #define NFL4_UFLG_DENSE			0x00000001
 #define NFL4_UFLG_COMMIT_THRU_MDS	0x00000002
 #define NFL4_UFLG_COMMIT_THRU_MDS	0x00000002

+ 1 - 1
include/linux/nfs_fs.h

@@ -353,7 +353,6 @@ extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct inode *, int);
 extern int nfs_permission(struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_open(struct inode *, struct file *);
-extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
@@ -371,6 +370,7 @@ extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struc
 extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode);
 extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode);
 extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx);
 extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx);
 extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
 extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
+extern void nfs_file_clear_open_context(struct file *flip);
 extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
 extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
 extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
 extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
 extern u64 nfs_compat_user_ino64(u64 fileid);
 extern u64 nfs_compat_user_ino64(u64 fileid);

+ 5 - 0
include/linux/nfs_fs_sb.h

@@ -173,6 +173,11 @@ struct nfs_server {
 						   set of attributes supported
 						   set of attributes supported
 						   on this filesystem excluding
 						   on this filesystem excluding
 						   the label support bit. */
 						   the label support bit. */
+	u32			exclcreat_bitmask[3];
+						/* V4 bitmask representing the
+						   set of attributes supported
+						   on this filesystem for the
+						   exclusive create. */
 	u32			cache_consistency_bitmask[3];
 	u32			cache_consistency_bitmask[3];
 						/* V4 bitmask representing the subset
 						/* V4 bitmask representing the subset
 						   of change attribute, size, ctime
 						   of change attribute, size, ctime

+ 5 - 3
include/linux/nfs_xdr.h

@@ -379,7 +379,7 @@ struct nfs_openargs {
 	struct stateowner_id	id;
 	struct stateowner_id	id;
 	union {
 	union {
 		struct {
 		struct {
-			struct iattr *  attrs;    /* UNCHECKED, GUARDED */
+			struct iattr *  attrs;    /* UNCHECKED, GUARDED, EXCLUSIVE4_1 */
 			nfs4_verifier   verifier; /* EXCLUSIVE */
 			nfs4_verifier   verifier; /* EXCLUSIVE */
 		};
 		};
 		nfs4_stateid	delegation;		/* CLAIM_DELEGATE_CUR */
 		nfs4_stateid	delegation;		/* CLAIM_DELEGATE_CUR */
@@ -389,7 +389,7 @@ struct nfs_openargs {
 	const struct nfs_server *server;	 /* Needed for ID mapping */
 	const struct nfs_server *server;	 /* Needed for ID mapping */
 	const u32 *		bitmask;
 	const u32 *		bitmask;
 	const u32 *		open_bitmap;
 	const u32 *		open_bitmap;
-	__u32			claim;
+	enum open_claim_type4	claim;
 	enum createmode4	createmode;
 	enum createmode4	createmode;
 	const struct nfs4_label *label;
 	const struct nfs4_label *label;
 };
 };
@@ -406,8 +406,8 @@ struct nfs_openres {
 	const struct nfs_server *server;
 	const struct nfs_server *server;
 	fmode_t			delegation_type;
 	fmode_t			delegation_type;
 	nfs4_stateid		delegation;
 	nfs4_stateid		delegation;
+	unsigned long		pagemod_limit;
 	__u32			do_recall;
 	__u32			do_recall;
-	__u64			maxsize;
 	__u32			attrset[NFS4_BITMAP_SIZE];
 	__u32			attrset[NFS4_BITMAP_SIZE];
 	struct nfs4_string	*owner;
 	struct nfs4_string	*owner;
 	struct nfs4_string	*group_owner;
 	struct nfs4_string	*group_owner;
@@ -1057,11 +1057,13 @@ struct nfs4_statfs_res {
 struct nfs4_server_caps_arg {
 struct nfs4_server_caps_arg {
 	struct nfs4_sequence_args	seq_args;
 	struct nfs4_sequence_args	seq_args;
 	struct nfs_fh		       *fhandle;
 	struct nfs_fh		       *fhandle;
+	const u32 *			bitmask;
 };
 };
 
 
 struct nfs4_server_caps_res {
 struct nfs4_server_caps_res {
 	struct nfs4_sequence_res	seq_res;
 	struct nfs4_sequence_res	seq_res;
 	u32				attr_bitmask[3];
 	u32				attr_bitmask[3];
+	u32				exclcreat_bitmask[3];
 	u32				acl_bitmask;
 	u32				acl_bitmask;
 	u32				has_links;
 	u32				has_links;
 	u32				has_symlinks;
 	u32				has_symlinks;

+ 20 - 7
include/linux/sunrpc/addr.h

@@ -46,8 +46,8 @@ static inline void rpc_set_port(struct sockaddr *sap,
 #define IPV6_SCOPE_DELIMITER		'%'
 #define IPV6_SCOPE_DELIMITER		'%'
 #define IPV6_SCOPE_ID_LEN		sizeof("%nnnnnnnnnn")
 #define IPV6_SCOPE_ID_LEN		sizeof("%nnnnnnnnnn")
 
 
-static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1,
-				   const struct sockaddr *sap2)
+static inline bool rpc_cmp_addr4(const struct sockaddr *sap1,
+				 const struct sockaddr *sap2)
 {
 {
 	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1;
 	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1;
 	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2;
 	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2;
@@ -67,8 +67,8 @@ static inline bool __rpc_copy_addr4(struct sockaddr *dst,
 }
 }
 
 
 #if IS_ENABLED(CONFIG_IPV6)
 #if IS_ENABLED(CONFIG_IPV6)
-static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
-				   const struct sockaddr *sap2)
+static inline bool rpc_cmp_addr6(const struct sockaddr *sap1,
+				 const struct sockaddr *sap2)
 {
 {
 	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1;
 	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1;
 	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
 	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
@@ -93,7 +93,7 @@ static inline bool __rpc_copy_addr6(struct sockaddr *dst,
 	return true;
 	return true;
 }
 }
 #else	/* !(IS_ENABLED(CONFIG_IPV6) */
 #else	/* !(IS_ENABLED(CONFIG_IPV6) */
-static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
+static inline bool rpc_cmp_addr6(const struct sockaddr *sap1,
 				   const struct sockaddr *sap2)
 				   const struct sockaddr *sap2)
 {
 {
 	return false;
 	return false;
@@ -122,14 +122,27 @@ static inline bool rpc_cmp_addr(const struct sockaddr *sap1,
 	if (sap1->sa_family == sap2->sa_family) {
 	if (sap1->sa_family == sap2->sa_family) {
 		switch (sap1->sa_family) {
 		switch (sap1->sa_family) {
 		case AF_INET:
 		case AF_INET:
-			return __rpc_cmp_addr4(sap1, sap2);
+			return rpc_cmp_addr4(sap1, sap2);
 		case AF_INET6:
 		case AF_INET6:
-			return __rpc_cmp_addr6(sap1, sap2);
+			return rpc_cmp_addr6(sap1, sap2);
 		}
 		}
 	}
 	}
 	return false;
 	return false;
 }
 }
 
 
+/**
+ * rpc_cmp_addr_port - compare the address and port number of two sockaddrs.
+ * @sap1: first sockaddr
+ * @sap2: second sockaddr
+ */
+static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1,
+				     const struct sockaddr *sap2)
+{
+	if (!rpc_cmp_addr(sap1, sap2))
+		return false;
+	return rpc_get_port(sap1) == rpc_get_port(sap2);
+}
+
 /**
 /**
  * rpc_copy_addr - copy the address portion of one sockaddr to another
  * rpc_copy_addr - copy the address portion of one sockaddr to another
  * @dst: destination sockaddr
  * @dst: destination sockaddr

+ 6 - 2
include/linux/sunrpc/auth.h

@@ -18,9 +18,13 @@
 #include <linux/atomic.h>
 #include <linux/atomic.h>
 #include <linux/rcupdate.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/uidgid.h>
+#include <linux/utsname.h>
 
 
-/* size of the nodename buffer */
-#define UNX_MAXNODENAME	32
+/*
+ * Size of the nodename buffer. RFC1831 specifies a hard limit of 255 bytes,
+ * but Linux hostnames are actually limited to __NEW_UTS_LEN bytes.
+ */
+#define UNX_MAXNODENAME	__NEW_UTS_LEN
 
 
 struct rpcsec_gss_info;
 struct rpcsec_gss_info;
 
 

+ 1 - 1
include/linux/sunrpc/xprtrdma.h

@@ -49,7 +49,7 @@
  * a single chunk type per message is supported currently.
  * a single chunk type per message is supported currently.
  */
  */
 #define RPCRDMA_MIN_SLOT_TABLE	(2U)
 #define RPCRDMA_MIN_SLOT_TABLE	(2U)
-#define RPCRDMA_DEF_SLOT_TABLE	(32U)
+#define RPCRDMA_DEF_SLOT_TABLE	(128U)
 #define RPCRDMA_MAX_SLOT_TABLE	(256U)
 #define RPCRDMA_MAX_SLOT_TABLE	(256U)
 
 
 #define RPCRDMA_DEF_INLINE  (1024)	/* default inline max */
 #define RPCRDMA_DEF_INLINE  (1024)	/* default inline max */

+ 0 - 46
include/rdma/ib_verbs.h

@@ -2759,52 +2759,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
 		dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
 		dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
 }
 }
 
 
-/**
- * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
- *   by an HCA.
- * @pd: The protection domain associated assigned to the registered region.
- * @phys_buf_array: Specifies a list of physical buffers to use in the
- *   memory region.
- * @num_phys_buf: Specifies the size of the phys_buf_array.
- * @mr_access_flags: Specifies the memory access rights.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
-			     struct ib_phys_buf *phys_buf_array,
-			     int num_phys_buf,
-			     int mr_access_flags,
-			     u64 *iova_start);
-
-/**
- * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
- *   Conceptually, this call performs the functions deregister memory region
- *   followed by register physical memory region.  Where possible,
- *   resources are reused instead of deallocated and reallocated.
- * @mr: The memory region to modify.
- * @mr_rereg_mask: A bit-mask used to indicate which of the following
- *   properties of the memory region are being modified.
- * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
- *   the new protection domain to associated with the memory region,
- *   otherwise, this parameter is ignored.
- * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- *   field specifies a list of physical buffers to use in the new
- *   translation, otherwise, this parameter is ignored.
- * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- *   field specifies the size of the phys_buf_array, otherwise, this
- *   parameter is ignored.
- * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
- *   field specifies the new memory access rights, otherwise, this
- *   parameter is ignored.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-int ib_rereg_phys_mr(struct ib_mr *mr,
-		     int mr_rereg_mask,
-		     struct ib_pd *pd,
-		     struct ib_phys_buf *phys_buf_array,
-		     int num_phys_buf,
-		     int mr_access_flags,
-		     u64 *iova_start);
-
 /**
 /**
  * ib_query_mr - Retrieves information about a specific memory region.
  * ib_query_mr - Retrieves information about a specific memory region.
  * @mr: The memory region to retrieve information about.
  * @mr: The memory region to retrieve information about.

+ 1 - 1
include/uapi/linux/nfs4.h

@@ -15,7 +15,7 @@
 
 
 #include <linux/types.h>
 #include <linux/types.h>
 
 
-#define NFS4_BITMAP_SIZE	2
+#define NFS4_BITMAP_SIZE	3
 #define NFS4_VERIFIER_SIZE	8
 #define NFS4_VERIFIER_SIZE	8
 #define NFS4_STATEID_SEQID_SIZE 4
 #define NFS4_STATEID_SEQID_SIZE 4
 #define NFS4_STATEID_OTHER_SIZE 12
 #define NFS4_STATEID_OTHER_SIZE 12

+ 1 - 1
net/sunrpc/auth_unix.c

@@ -23,7 +23,7 @@ struct unx_cred {
 };
 };
 #define uc_uid			uc_base.cr_uid
 #define uc_uid			uc_base.cr_uid
 
 
-#define UNX_WRITESLACK		(21 + (UNX_MAXNODENAME >> 2))
+#define UNX_WRITESLACK		(21 + XDR_QUADLEN(UNX_MAXNODENAME))
 
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 # define RPCDBG_FACILITY	RPCDBG_AUTH

+ 19 - 0
net/sunrpc/xprtrdma/fmr_ops.c

@@ -39,6 +39,25 @@ static int
 fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 	    struct rpcrdma_create_data_internal *cdata)
 	    struct rpcrdma_create_data_internal *cdata)
 {
 {
+	struct ib_device_attr *devattr = &ia->ri_devattr;
+	struct ib_mr *mr;
+
+	/* Obtain an lkey to use for the regbufs, which are
+	 * protected from remote access.
+	 */
+	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+		ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+	} else {
+		mr = ib_get_dma_mr(ia->ri_pd, IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(mr)) {
+			pr_err("%s: ib_get_dma_mr for failed with %lX\n",
+			       __func__, PTR_ERR(mr));
+			return -ENOMEM;
+		}
+		ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
+		ia->ri_dma_mr = mr;
+	}
+
 	return 0;
 	return 0;
 }
 }
 
 

+ 5 - 0
net/sunrpc/xprtrdma/frwr_ops.c

@@ -189,6 +189,11 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 	struct ib_device_attr *devattr = &ia->ri_devattr;
 	struct ib_device_attr *devattr = &ia->ri_devattr;
 	int depth, delta;
 	int depth, delta;
 
 
+	/* Obtain an lkey to use for the regbufs, which are
+	 * protected from remote access.
+	 */
+	ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+
 	ia->ri_max_frmr_depth =
 	ia->ri_max_frmr_depth =
 			min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
 			min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
 			      devattr->max_fast_reg_page_list_len);
 			      devattr->max_fast_reg_page_list_len);

+ 24 - 1
net/sunrpc/xprtrdma/physical_ops.c

@@ -23,6 +23,29 @@ static int
 physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 		 struct rpcrdma_create_data_internal *cdata)
 		 struct rpcrdma_create_data_internal *cdata)
 {
 {
+	struct ib_device_attr *devattr = &ia->ri_devattr;
+	struct ib_mr *mr;
+
+	/* Obtain an rkey to use for RPC data payloads.
+	 */
+	mr = ib_get_dma_mr(ia->ri_pd,
+			   IB_ACCESS_LOCAL_WRITE |
+			   IB_ACCESS_REMOTE_WRITE |
+			   IB_ACCESS_REMOTE_READ);
+	if (IS_ERR(mr)) {
+		pr_err("%s: ib_get_dma_mr for failed with %lX\n",
+		       __func__, PTR_ERR(mr));
+		return -ENOMEM;
+	}
+	ia->ri_dma_mr = mr;
+
+	/* Obtain an lkey to use for regbufs.
+	 */
+	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+		ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+	else
+		ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -51,7 +74,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
 
 	rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
 	rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
-	seg->mr_rkey = ia->ri_bind_mem->rkey;
+	seg->mr_rkey = ia->ri_dma_mr->rkey;
 	seg->mr_base = seg->mr_dma;
 	seg->mr_base = seg->mr_dma;
 	seg->mr_nsegs = 1;
 	seg->mr_nsegs = 1;
 	return 1;
 	return 1;

+ 101 - 96
net/sunrpc/xprtrdma/rpc_rdma.c

@@ -71,6 +71,67 @@ static const char transfertypes[][12] = {
 };
 };
 #endif
 #endif
 
 
+/* The client can send a request inline as long as the RPCRDMA header
+ * plus the RPC call fit under the transport's inline limit. If the
+ * combined call message size exceeds that limit, the client must use
+ * the read chunk list for this operation.
+ */
+static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+{
+	unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+
+	return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+}
+
+/* The client can't know how large the actual reply will be. Thus it
+ * plans for the largest possible reply for that particular ULP
+ * operation. If the maximum combined reply message size exceeds that
+ * limit, the client must provide a write list or a reply chunk for
+ * this request.
+ */
+static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+{
+	unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+
+	return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+}
+
+static int
+rpcrdma_tail_pullup(struct xdr_buf *buf)
+{
+	size_t tlen = buf->tail[0].iov_len;
+	size_t skip = tlen & 3;
+
+	/* Do not include the tail if it is only an XDR pad */
+	if (tlen < 4)
+		return 0;
+
+	/* xdr_write_pages() adds a pad at the beginning of the tail
+	 * if the content in "buf->pages" is unaligned. Force the
+	 * tail's actual content to land at the next XDR position
+	 * after the head instead.
+	 */
+	if (skip) {
+		unsigned char *src, *dst;
+		unsigned int count;
+
+		src = buf->tail[0].iov_base;
+		dst = buf->head[0].iov_base;
+		dst += buf->head[0].iov_len;
+
+		src += skip;
+		tlen -= skip;
+
+		dprintk("RPC:       %s: skip=%zu, memmove(%p, %p, %zu)\n",
+			__func__, skip, dst, src, tlen);
+
+		for (count = tlen; count; count--)
+			*dst++ = *src++;
+	}
+
+	return tlen;
+}
+
 /*
 /*
  * Chunk assembly from upper layer xdr_buf.
  * Chunk assembly from upper layer xdr_buf.
  *
  *
@@ -122,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 	if (len && n == nsegs)
 	if (len && n == nsegs)
 		return -EIO;
 		return -EIO;
 
 
+	/* When encoding the read list, the tail is always sent inline */
+	if (type == rpcrdma_readch)
+		return n;
+
 	if (xdrbuf->tail[0].iov_len) {
 	if (xdrbuf->tail[0].iov_len) {
 		/* the rpcrdma protocol allows us to omit any trailing
 		/* the rpcrdma protocol allows us to omit any trailing
 		 * xdr pad bytes, saving the server an RDMA operation. */
 		 * xdr pad bytes, saving the server an RDMA operation. */
@@ -297,8 +362,7 @@ out:
  * pre-registered memory buffer for this request. For small amounts
  * pre-registered memory buffer for this request. For small amounts
  * of data, this is efficient. The cutoff value is tunable.
  * of data, this is efficient. The cutoff value is tunable.
  */
  */
-static int
-rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
 {
 {
 	int i, npages, curlen;
 	int i, npages, curlen;
 	int copy_len;
 	int copy_len;
@@ -310,16 +374,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 	destp = rqst->rq_svec[0].iov_base;
 	destp = rqst->rq_svec[0].iov_base;
 	curlen = rqst->rq_svec[0].iov_len;
 	curlen = rqst->rq_svec[0].iov_len;
 	destp += curlen;
 	destp += curlen;
-	/*
-	 * Do optional padding where it makes sense. Alignment of write
-	 * payload can help the server, if our setting is accurate.
-	 */
-	pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
-	if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
-		pad = 0;	/* don't pad this request */
 
 
-	dprintk("RPC:       %s: pad %d destp 0x%p len %d hdrlen %d\n",
-		__func__, pad, destp, rqst->rq_slen, curlen);
+	dprintk("RPC:       %s: destp 0x%p len %d hdrlen %d\n",
+		__func__, destp, rqst->rq_slen, curlen);
 
 
 	copy_len = rqst->rq_snd_buf.page_len;
 	copy_len = rqst->rq_snd_buf.page_len;
 
 
@@ -355,7 +412,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 		page_base = 0;
 		page_base = 0;
 	}
 	}
 	/* header now contains entire send message */
 	/* header now contains entire send message */
-	return pad;
 }
 }
 
 
 /*
 /*
@@ -380,7 +436,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	char *base;
 	char *base;
-	size_t rpclen, padlen;
+	size_t rpclen;
 	ssize_t hdrlen;
 	ssize_t hdrlen;
 	enum rpcrdma_chunktype rtype, wtype;
 	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
 	struct rpcrdma_msg *headerp;
@@ -402,28 +458,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	/*
 	/*
 	 * Chunks needed for results?
 	 * Chunks needed for results?
 	 *
 	 *
+	 * o Read ops return data as write chunk(s), header as inline.
 	 * o If the expected result is under the inline threshold, all ops
 	 * o If the expected result is under the inline threshold, all ops
-	 *   return as inline (but see later).
+	 *   return as inline.
 	 * o Large non-read ops return as a single reply chunk.
 	 * o Large non-read ops return as a single reply chunk.
-	 * o Large read ops return data as write chunk(s), header as inline.
-	 *
-	 * Note: the NFS code sending down multiple result segments implies
-	 * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
-	 */
-
-	/*
-	 * This code can handle read chunks, write chunks OR reply
-	 * chunks -- only one type. If the request is too big to fit
-	 * inline, then we will choose read chunks. If the request is
-	 * a READ, then use write chunks to separate the file data
-	 * into pages; otherwise use reply chunks.
 	 */
 	 */
-	if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
-		wtype = rpcrdma_noch;
-	else if (rqst->rq_rcv_buf.page_len == 0)
-		wtype = rpcrdma_replych;
-	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+	if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
 		wtype = rpcrdma_writech;
 		wtype = rpcrdma_writech;
+	else if (rpcrdma_results_inline(rqst))
+		wtype = rpcrdma_noch;
 	else
 	else
 		wtype = rpcrdma_replych;
 		wtype = rpcrdma_replych;
 
 
@@ -432,21 +475,25 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 *
 	 *
 	 * o If the total request is under the inline threshold, all ops
 	 * o If the total request is under the inline threshold, all ops
 	 *   are sent as inline.
 	 *   are sent as inline.
-	 * o Large non-write ops are sent with the entire message as a
-	 *   single read chunk (protocol 0-position special case).
 	 * o Large write ops transmit data as read chunk(s), header as
 	 * o Large write ops transmit data as read chunk(s), header as
 	 *   inline.
 	 *   inline.
+	 * o Large non-write ops are sent with the entire message as a
+	 *   single read chunk (protocol 0-position special case).
 	 *
 	 *
-	 * Note: the NFS code sending down multiple argument segments
-	 * implies the op is a write.
-	 * TBD check NFSv4 setacl
+	 * This assumes that the upper layer does not present a request
+	 * that both has a data payload, and whose non-data arguments
+	 * by themselves are larger than the inline threshold.
 	 */
 	 */
-	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+	if (rpcrdma_args_inline(rqst)) {
 		rtype = rpcrdma_noch;
 		rtype = rpcrdma_noch;
-	else if (rqst->rq_snd_buf.page_len == 0)
-		rtype = rpcrdma_areadch;
-	else
+	} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
 		rtype = rpcrdma_readch;
 		rtype = rpcrdma_readch;
+	} else {
+		r_xprt->rx_stats.nomsg_call_count++;
+		headerp->rm_type = htonl(RDMA_NOMSG);
+		rtype = rpcrdma_areadch;
+		rpclen = 0;
+	}
 
 
 	/* The following simplification is not true forever */
 	/* The following simplification is not true forever */
 	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
 	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
@@ -458,7 +505,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	}
 	}
 
 
 	hdrlen = RPCRDMA_HDRLEN_MIN;
 	hdrlen = RPCRDMA_HDRLEN_MIN;
-	padlen = 0;
 
 
 	/*
 	/*
 	 * Pull up any extra send data into the preregistered buffer.
 	 * Pull up any extra send data into the preregistered buffer.
@@ -467,45 +513,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 */
 	 */
 	if (rtype == rpcrdma_noch) {
 	if (rtype == rpcrdma_noch) {
 
 
-		padlen = rpcrdma_inline_pullup(rqst,
-						RPCRDMA_INLINE_PAD_VALUE(rqst));
-
-		if (padlen) {
-			headerp->rm_type = rdma_msgp;
-			headerp->rm_body.rm_padded.rm_align =
-				cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
-			headerp->rm_body.rm_padded.rm_thresh =
-				cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
-			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
-			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
-			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
-			hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-			if (wtype != rpcrdma_noch) {
-				dprintk("RPC:       %s: invalid chunk list\n",
-					__func__);
-				return -EIO;
-			}
-		} else {
-			headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
-			headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
-			headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
-			/* new length after pullup */
-			rpclen = rqst->rq_svec[0].iov_len;
-			/*
-			 * Currently we try to not actually use read inline.
-			 * Reply chunks have the desirable property that
-			 * they land, packed, directly in the target buffers
-			 * without headers, so they require no fixup. The
-			 * additional RDMA Write op sends the same amount
-			 * of data, streams on-the-wire and adds no overhead
-			 * on receive. Therefore, we request a reply chunk
-			 * for non-writes wherever feasible and efficient.
-			 */
-			if (wtype == rpcrdma_noch)
-				wtype = rpcrdma_replych;
-		}
-	}
+		rpcrdma_inline_pullup(rqst);
 
 
+		headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+		headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+		headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+		/* new length after pullup */
+		rpclen = rqst->rq_svec[0].iov_len;
+	} else if (rtype == rpcrdma_readch)
+		rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
 	if (rtype != rpcrdma_noch) {
 	if (rtype != rpcrdma_noch) {
 		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
 		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
 					       headerp, rtype);
 					       headerp, rtype);
@@ -518,9 +534,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	if (hdrlen < 0)
 	if (hdrlen < 0)
 		return hdrlen;
 		return hdrlen;
 
 
-	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd"
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
-		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+		__func__, transfertypes[wtype], hdrlen, rpclen,
 		headerp, base, rdmab_lkey(req->rl_rdmabuf));
 		headerp, base, rdmab_lkey(req->rl_rdmabuf));
 
 
 	/*
 	/*
@@ -534,26 +550,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	req->rl_send_iov[0].length = hdrlen;
 	req->rl_send_iov[0].length = hdrlen;
 	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
 	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
 
 
+	req->rl_niovs = 1;
+	if (rtype == rpcrdma_areadch)
+		return 0;
+
 	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
 	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
 	req->rl_send_iov[1].length = rpclen;
 	req->rl_send_iov[1].length = rpclen;
 	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
 	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
 
 
 	req->rl_niovs = 2;
 	req->rl_niovs = 2;
-
-	if (padlen) {
-		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
-		req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
-		req->rl_send_iov[2].length = padlen;
-		req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
-
-		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
-		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
-		req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
-
-		req->rl_niovs = 4;
-	}
-
 	return 0;
 	return 0;
 }
 }
 
 

+ 35 - 42
net/sunrpc/xprtrdma/transport.c

@@ -175,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
 }
 }
 
 
 static void
 static void
-xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
 {
 {
-	struct sockaddr *sap = (struct sockaddr *)
-					&rpcx_to_rdmad(xprt).addr;
 	char buf[128];
 	char buf[128];
 
 
 	switch (sap->sa_family) {
 	switch (sap->sa_family) {
@@ -302,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args)
 	struct rpc_xprt *xprt;
 	struct rpc_xprt *xprt;
 	struct rpcrdma_xprt *new_xprt;
 	struct rpcrdma_xprt *new_xprt;
 	struct rpcrdma_ep *new_ep;
 	struct rpcrdma_ep *new_ep;
-	struct sockaddr_in *sin;
+	struct sockaddr *sap;
 	int rc;
 	int rc;
 
 
 	if (args->addrlen > sizeof(xprt->addr)) {
 	if (args->addrlen > sizeof(xprt->addr)) {
@@ -333,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args)
 	 * Set up RDMA-specific connect data.
 	 * Set up RDMA-specific connect data.
 	 */
 	 */
 
 
-	/* Put server RDMA address in local cdata */
-	memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+	sap = (struct sockaddr *)&cdata.addr;
+	memcpy(sap, args->dstaddr, args->addrlen);
 
 
 	/* Ensure xprt->addr holds valid server TCP (not RDMA)
 	/* Ensure xprt->addr holds valid server TCP (not RDMA)
 	 * address, for any side protocols which peek at it */
 	 * address, for any side protocols which peek at it */
 	xprt->prot = IPPROTO_TCP;
 	xprt->prot = IPPROTO_TCP;
 	xprt->addrlen = args->addrlen;
 	xprt->addrlen = args->addrlen;
-	memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+	memcpy(&xprt->addr, sap, xprt->addrlen);
 
 
-	sin = (struct sockaddr_in *)&cdata.addr;
-	if (ntohs(sin->sin_port) != 0)
+	if (rpc_get_port(sap))
 		xprt_set_bound(xprt);
 		xprt_set_bound(xprt);
 
 
-	dprintk("RPC:       %s: %pI4:%u\n",
-		__func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
-
-	/* Set max requests */
 	cdata.max_requests = xprt->max_reqs;
 	cdata.max_requests = xprt->max_reqs;
 
 
-	/* Set some length limits */
 	cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
 	cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
 	cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
 	cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
 
 
@@ -375,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args)
 
 
 	new_xprt = rpcx_to_rdmax(xprt);
 	new_xprt = rpcx_to_rdmax(xprt);
 
 
-	rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
-				xprt_rdma_memreg_strategy);
+	rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
 	if (rc)
 	if (rc)
 		goto out1;
 		goto out1;
 
 
@@ -409,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args)
 	INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
 	INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
 			  xprt_rdma_connect_worker);
 			  xprt_rdma_connect_worker);
 
 
-	xprt_rdma_format_addresses(xprt);
+	xprt_rdma_format_addresses(xprt, sap);
 	xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
 	xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
 	if (xprt->max_payload == 0)
 	if (xprt->max_payload == 0)
 		goto out4;
 		goto out4;
@@ -420,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args)
 	if (!try_module_get(THIS_MODULE))
 	if (!try_module_get(THIS_MODULE))
 		goto out4;
 		goto out4;
 
 
+	dprintk("RPC:       %s: %s:%s\n", __func__,
+		xprt->address_strings[RPC_DISPLAY_ADDR],
+		xprt->address_strings[RPC_DISPLAY_PORT]);
 	return xprt;
 	return xprt;
 
 
 out4:
 out4:
@@ -653,31 +647,30 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 	if (xprt_connected(xprt))
 	if (xprt_connected(xprt))
 		idle_time = (long)(jiffies - xprt->last_used) / HZ;
 		idle_time = (long)(jiffies - xprt->last_used) / HZ;
 
 
-	seq_printf(seq,
-	  "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
-	  "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
-
-	   0,	/* need a local port? */
-	   xprt->stat.bind_count,
-	   xprt->stat.connect_count,
-	   xprt->stat.connect_time,
-	   idle_time,
-	   xprt->stat.sends,
-	   xprt->stat.recvs,
-	   xprt->stat.bad_xids,
-	   xprt->stat.req_u,
-	   xprt->stat.bklog_u,
-
-	   r_xprt->rx_stats.read_chunk_count,
-	   r_xprt->rx_stats.write_chunk_count,
-	   r_xprt->rx_stats.reply_chunk_count,
-	   r_xprt->rx_stats.total_rdma_request,
-	   r_xprt->rx_stats.total_rdma_reply,
-	   r_xprt->rx_stats.pullup_copy_count,
-	   r_xprt->rx_stats.fixup_copy_count,
-	   r_xprt->rx_stats.hardway_register_count,
-	   r_xprt->rx_stats.failed_marshal_count,
-	   r_xprt->rx_stats.bad_reply_count);
+	seq_puts(seq, "\txprt:\trdma ");
+	seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ",
+		   0,	/* need a local port? */
+		   xprt->stat.bind_count,
+		   xprt->stat.connect_count,
+		   xprt->stat.connect_time,
+		   idle_time,
+		   xprt->stat.sends,
+		   xprt->stat.recvs,
+		   xprt->stat.bad_xids,
+		   xprt->stat.req_u,
+		   xprt->stat.bklog_u);
+	seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+		   r_xprt->rx_stats.read_chunk_count,
+		   r_xprt->rx_stats.write_chunk_count,
+		   r_xprt->rx_stats.reply_chunk_count,
+		   r_xprt->rx_stats.total_rdma_request,
+		   r_xprt->rx_stats.total_rdma_reply,
+		   r_xprt->rx_stats.pullup_copy_count,
+		   r_xprt->rx_stats.fixup_copy_count,
+		   r_xprt->rx_stats.hardway_register_count,
+		   r_xprt->rx_stats.failed_marshal_count,
+		   r_xprt->rx_stats.bad_reply_count,
+		   r_xprt->rx_stats.nomsg_call_count);
 }
 }
 
 
 static int
 static int

+ 79 - 155
net/sunrpc/xprtrdma/verbs.c

@@ -52,6 +52,7 @@
 #include <linux/prefetch.h>
 #include <linux/prefetch.h>
 #include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/addr.h>
 #include <asm/bitops.h>
 #include <asm/bitops.h>
+#include <linux/module.h> /* try_module_get()/module_put() */
 
 
 #include "xprt_rdma.h"
 #include "xprt_rdma.h"
 
 
@@ -414,6 +415,14 @@ connected:
 	return 0;
 	return 0;
 }
 }
 
 
+static void rpcrdma_destroy_id(struct rdma_cm_id *id)
+{
+	if (id) {
+		module_put(id->device->owner);
+		rdma_destroy_id(id);
+	}
+}
+
 static struct rdma_cm_id *
 static struct rdma_cm_id *
 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 			struct rpcrdma_ia *ia, struct sockaddr *addr)
 			struct rpcrdma_ia *ia, struct sockaddr *addr)
@@ -440,6 +449,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 	}
 	}
 	wait_for_completion_interruptible_timeout(&ia->ri_done,
 	wait_for_completion_interruptible_timeout(&ia->ri_done,
 				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+
+	/* FIXME:
+	 * Until xprtrdma supports DEVICE_REMOVAL, the provider must
+	 * be pinned while there are active NFS/RDMA mounts to prevent
+	 * hangs and crashes at umount time.
+	 */
+	if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
+		dprintk("RPC:       %s: Failed to get device module\n",
+			__func__);
+		ia->ri_async_rc = -ENODEV;
+	}
 	rc = ia->ri_async_rc;
 	rc = ia->ri_async_rc;
 	if (rc)
 	if (rc)
 		goto out;
 		goto out;
@@ -449,16 +469,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 	if (rc) {
 	if (rc) {
 		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
 		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
 			__func__, rc);
 			__func__, rc);
-		goto out;
+		goto put;
 	}
 	}
 	wait_for_completion_interruptible_timeout(&ia->ri_done,
 	wait_for_completion_interruptible_timeout(&ia->ri_done,
 				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 	rc = ia->ri_async_rc;
 	rc = ia->ri_async_rc;
 	if (rc)
 	if (rc)
-		goto out;
+		goto put;
 
 
 	return id;
 	return id;
-
+put:
+	module_put(id->device->owner);
 out:
 out:
 	rdma_destroy_id(id);
 	rdma_destroy_id(id);
 	return ERR_PTR(rc);
 	return ERR_PTR(rc);
@@ -493,9 +514,11 @@ rpcrdma_clean_cq(struct ib_cq *cq)
 int
 int
 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 {
 {
-	int rc, mem_priv;
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 	struct ib_device_attr *devattr = &ia->ri_devattr;
 	struct ib_device_attr *devattr = &ia->ri_devattr;
+	int rc;
+
+	ia->ri_dma_mr = NULL;
 
 
 	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
 	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
 	if (IS_ERR(ia->ri_id)) {
 	if (IS_ERR(ia->ri_id)) {
@@ -519,11 +542,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 		goto out3;
 		goto out3;
 	}
 	}
 
 
-	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
-		ia->ri_have_dma_lkey = 1;
-		ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
-	}
-
 	if (memreg == RPCRDMA_FRMR) {
 	if (memreg == RPCRDMA_FRMR) {
 		/* Requires both frmr reg and local dma lkey */
 		/* Requires both frmr reg and local dma lkey */
 		if (((devattr->device_cap_flags &
 		if (((devattr->device_cap_flags &
@@ -539,42 +557,19 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 		if (!ia->ri_device->alloc_fmr) {
 		if (!ia->ri_device->alloc_fmr) {
 			dprintk("RPC:       %s: MTHCAFMR registration "
 			dprintk("RPC:       %s: MTHCAFMR registration "
 				"not supported by HCA\n", __func__);
 				"not supported by HCA\n", __func__);
-			memreg = RPCRDMA_ALLPHYSICAL;
+			goto out3;
 		}
 		}
 	}
 	}
 
 
-	/*
-	 * Optionally obtain an underlying physical identity mapping in
-	 * order to do a memory window-based bind. This base registration
-	 * is protected from remote access - that is enabled only by binding
-	 * for the specific bytes targeted during each RPC operation, and
-	 * revoked after the corresponding completion similar to a storage
-	 * adapter.
-	 */
 	switch (memreg) {
 	switch (memreg) {
 	case RPCRDMA_FRMR:
 	case RPCRDMA_FRMR:
 		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
 		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
 		break;
 		break;
 	case RPCRDMA_ALLPHYSICAL:
 	case RPCRDMA_ALLPHYSICAL:
 		ia->ri_ops = &rpcrdma_physical_memreg_ops;
 		ia->ri_ops = &rpcrdma_physical_memreg_ops;
-		mem_priv = IB_ACCESS_LOCAL_WRITE |
-				IB_ACCESS_REMOTE_WRITE |
-				IB_ACCESS_REMOTE_READ;
-		goto register_setup;
+		break;
 	case RPCRDMA_MTHCAFMR:
 	case RPCRDMA_MTHCAFMR:
 		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
 		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
-		if (ia->ri_have_dma_lkey)
-			break;
-		mem_priv = IB_ACCESS_LOCAL_WRITE;
-	register_setup:
-		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
-		if (IS_ERR(ia->ri_bind_mem)) {
-			printk(KERN_ALERT "%s: ib_get_dma_mr for "
-				"phys register failed with %lX\n",
-				__func__, PTR_ERR(ia->ri_bind_mem));
-			rc = -ENOMEM;
-			goto out3;
-		}
 		break;
 		break;
 	default:
 	default:
 		printk(KERN_ERR "RPC: Unsupported memory "
 		printk(KERN_ERR "RPC: Unsupported memory "
@@ -592,7 +587,7 @@ out3:
 	ib_dealloc_pd(ia->ri_pd);
 	ib_dealloc_pd(ia->ri_pd);
 	ia->ri_pd = NULL;
 	ia->ri_pd = NULL;
 out2:
 out2:
-	rdma_destroy_id(ia->ri_id);
+	rpcrdma_destroy_id(ia->ri_id);
 	ia->ri_id = NULL;
 	ia->ri_id = NULL;
 out1:
 out1:
 	return rc;
 	return rc;
@@ -606,19 +601,11 @@ out1:
 void
 void
 rpcrdma_ia_close(struct rpcrdma_ia *ia)
 rpcrdma_ia_close(struct rpcrdma_ia *ia)
 {
 {
-	int rc;
-
 	dprintk("RPC:       %s: entering\n", __func__);
 	dprintk("RPC:       %s: entering\n", __func__);
-	if (ia->ri_bind_mem != NULL) {
-		rc = ib_dereg_mr(ia->ri_bind_mem);
-		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
-			__func__, rc);
-	}
-
 	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
 	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
 		if (ia->ri_id->qp)
 		if (ia->ri_id->qp)
 			rdma_destroy_qp(ia->ri_id);
 			rdma_destroy_qp(ia->ri_id);
-		rdma_destroy_id(ia->ri_id);
+		rpcrdma_destroy_id(ia->ri_id);
 		ia->ri_id = NULL;
 		ia->ri_id = NULL;
 	}
 	}
 
 
@@ -639,6 +626,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	struct ib_cq_init_attr cq_attr = {};
 	struct ib_cq_init_attr cq_attr = {};
 	int rc, err;
 	int rc, err;
 
 
+	if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+		dprintk("RPC:       %s: insufficient sge's available\n",
+			__func__);
+		return -ENOMEM;
+	}
+
 	/* check provider's send/recv wr limits */
 	/* check provider's send/recv wr limits */
 	if (cdata->max_requests > devattr->max_qp_wr)
 	if (cdata->max_requests > devattr->max_qp_wr)
 		cdata->max_requests = devattr->max_qp_wr;
 		cdata->max_requests = devattr->max_qp_wr;
@@ -651,21 +644,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	if (rc)
 	if (rc)
 		return rc;
 		return rc;
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
-	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 	ep->rep_attr.qp_type = IB_QPT_RC;
 	ep->rep_attr.qp_type = IB_QPT_RC;
 	ep->rep_attr.port_num = ~0;
 	ep->rep_attr.port_num = ~0;
 
 
-	if (cdata->padding) {
-		ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
-						      GFP_KERNEL);
-		if (IS_ERR(ep->rep_padbuf))
-			return PTR_ERR(ep->rep_padbuf);
-	} else
-		ep->rep_padbuf = NULL;
-
 	dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
 	dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
 		"iovs: send %d recv %d\n",
 		"iovs: send %d recv %d\n",
 		__func__,
 		__func__,
@@ -748,7 +733,8 @@ out2:
 		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
 		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
 			__func__, err);
 			__func__, err);
 out1:
 out1:
-	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
+	if (ia->ri_dma_mr)
+		ib_dereg_mr(ia->ri_dma_mr);
 	return rc;
 	return rc;
 }
 }
 
 
@@ -775,8 +761,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 		ia->ri_id->qp = NULL;
 		ia->ri_id->qp = NULL;
 	}
 	}
 
 
-	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-
 	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
 	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
 	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
 	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
 	if (rc)
 	if (rc)
@@ -788,6 +772,12 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 	if (rc)
 	if (rc)
 		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
 		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
 			__func__, rc);
 			__func__, rc);
+
+	if (ia->ri_dma_mr) {
+		rc = ib_dereg_mr(ia->ri_dma_mr);
+		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
+			__func__, rc);
+	}
 }
 }
 
 
 /*
 /*
@@ -825,7 +815,7 @@ retry:
 		if (ia->ri_device != id->device) {
 		if (ia->ri_device != id->device) {
 			printk("RPC:       %s: can't reconnect on "
 			printk("RPC:       %s: can't reconnect on "
 				"different device!\n", __func__);
 				"different device!\n", __func__);
-			rdma_destroy_id(id);
+			rpcrdma_destroy_id(id);
 			rc = -ENETUNREACH;
 			rc = -ENETUNREACH;
 			goto out;
 			goto out;
 		}
 		}
@@ -834,7 +824,7 @@ retry:
 		if (rc) {
 		if (rc) {
 			dprintk("RPC:       %s: rdma_create_qp failed %i\n",
 			dprintk("RPC:       %s: rdma_create_qp failed %i\n",
 				__func__, rc);
 				__func__, rc);
-			rdma_destroy_id(id);
+			rpcrdma_destroy_id(id);
 			rc = -ENETUNREACH;
 			rc = -ENETUNREACH;
 			goto out;
 			goto out;
 		}
 		}
@@ -845,7 +835,7 @@ retry:
 		write_unlock(&ia->ri_qplock);
 		write_unlock(&ia->ri_qplock);
 
 
 		rdma_destroy_qp(old);
 		rdma_destroy_qp(old);
-		rdma_destroy_id(old);
+		rpcrdma_destroy_id(old);
 	} else {
 	} else {
 		dprintk("RPC:       %s: connecting...\n", __func__);
 		dprintk("RPC:       %s: connecting...\n", __func__);
 		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
 		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -1229,75 +1219,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
 		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
 		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
 }
 }
 
 
-static int
-rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
-				struct ib_mr **mrp, struct ib_sge *iov)
-{
-	struct ib_phys_buf ipb;
-	struct ib_mr *mr;
-	int rc;
-
-	/*
-	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
-	 */
-	iov->addr = ib_dma_map_single(ia->ri_device,
-			va, len, DMA_BIDIRECTIONAL);
-	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
-		return -ENOMEM;
-
-	iov->length = len;
-
-	if (ia->ri_have_dma_lkey) {
-		*mrp = NULL;
-		iov->lkey = ia->ri_dma_lkey;
-		return 0;
-	} else if (ia->ri_bind_mem != NULL) {
-		*mrp = NULL;
-		iov->lkey = ia->ri_bind_mem->lkey;
-		return 0;
-	}
-
-	ipb.addr = iov->addr;
-	ipb.size = iov->length;
-	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
-			IB_ACCESS_LOCAL_WRITE, &iov->addr);
-
-	dprintk("RPC:       %s: phys convert: 0x%llx "
-			"registered 0x%llx length %d\n",
-			__func__, (unsigned long long)ipb.addr,
-			(unsigned long long)iov->addr, len);
-
-	if (IS_ERR(mr)) {
-		*mrp = NULL;
-		rc = PTR_ERR(mr);
-		dprintk("RPC:       %s: failed with %i\n", __func__, rc);
-	} else {
-		*mrp = mr;
-		iov->lkey = mr->lkey;
-		rc = 0;
-	}
-
-	return rc;
-}
-
-static int
-rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
-				struct ib_mr *mr, struct ib_sge *iov)
-{
-	int rc;
-
-	ib_dma_unmap_single(ia->ri_device,
-			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
-
-	if (NULL == mr)
-		return 0;
-
-	rc = ib_dereg_mr(mr);
-	if (rc)
-		dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
-	return rc;
-}
-
 /**
 /**
  * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
  * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
  * @ia: controlling rpcrdma_ia
  * @ia: controlling rpcrdma_ia
@@ -1317,26 +1238,29 @@ struct rpcrdma_regbuf *
 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
 {
 {
 	struct rpcrdma_regbuf *rb;
 	struct rpcrdma_regbuf *rb;
-	int rc;
+	struct ib_sge *iov;
 
 
-	rc = -ENOMEM;
 	rb = kmalloc(sizeof(*rb) + size, flags);
 	rb = kmalloc(sizeof(*rb) + size, flags);
 	if (rb == NULL)
 	if (rb == NULL)
 		goto out;
 		goto out;
 
 
-	rb->rg_size = size;
-	rb->rg_owner = NULL;
-	rc = rpcrdma_register_internal(ia, rb->rg_base, size,
-				       &rb->rg_mr, &rb->rg_iov);
-	if (rc)
+	iov = &rb->rg_iov;
+	iov->addr = ib_dma_map_single(ia->ri_device,
+				      (void *)rb->rg_base, size,
+				      DMA_BIDIRECTIONAL);
+	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
 		goto out_free;
 		goto out_free;
 
 
+	iov->length = size;
+	iov->lkey = ia->ri_dma_lkey;
+	rb->rg_size = size;
+	rb->rg_owner = NULL;
 	return rb;
 	return rb;
 
 
 out_free:
 out_free:
 	kfree(rb);
 	kfree(rb);
 out:
 out:
-	return ERR_PTR(rc);
+	return ERR_PTR(-ENOMEM);
 }
 }
 
 
 /**
 /**
@@ -1347,10 +1271,15 @@ out:
 void
 void
 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
 {
 {
-	if (rb) {
-		rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
-		kfree(rb);
-	}
+	struct ib_sge *iov;
+
+	if (!rb)
+		return;
+
+	iov = &rb->rg_iov;
+	ib_dma_unmap_single(ia->ri_device,
+			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
+	kfree(rb);
 }
 }
 
 
 /*
 /*
@@ -1363,9 +1292,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 		struct rpcrdma_ep *ep,
 		struct rpcrdma_ep *ep,
 		struct rpcrdma_req *req)
 		struct rpcrdma_req *req)
 {
 {
+	struct ib_device *device = ia->ri_device;
 	struct ib_send_wr send_wr, *send_wr_fail;
 	struct ib_send_wr send_wr, *send_wr_fail;
 	struct rpcrdma_rep *rep = req->rl_reply;
 	struct rpcrdma_rep *rep = req->rl_reply;
-	int rc;
+	struct ib_sge *iov = req->rl_send_iov;
+	int i, rc;
 
 
 	if (rep) {
 	if (rep) {
 		rc = rpcrdma_ep_post_recv(ia, ep, rep);
 		rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1376,22 +1307,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 
 
 	send_wr.next = NULL;
 	send_wr.next = NULL;
 	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
 	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
-	send_wr.sg_list = req->rl_send_iov;
+	send_wr.sg_list = iov;
 	send_wr.num_sge = req->rl_niovs;
 	send_wr.num_sge = req->rl_niovs;
 	send_wr.opcode = IB_WR_SEND;
 	send_wr.opcode = IB_WR_SEND;
-	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
-		ib_dma_sync_single_for_device(ia->ri_device,
-					      req->rl_send_iov[3].addr,
-					      req->rl_send_iov[3].length,
-					      DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_device,
-				      req->rl_send_iov[1].addr,
-				      req->rl_send_iov[1].length,
-				      DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_device,
-				      req->rl_send_iov[0].addr,
-				      req->rl_send_iov[0].length,
-				      DMA_TO_DEVICE);
+
+	for (i = 0; i < send_wr.num_sge; i++)
+		ib_dma_sync_single_for_device(device, iov[i].addr,
+					      iov[i].length, DMA_TO_DEVICE);
+	dprintk("RPC:       %s: posting %d s/g entries\n",
+		__func__, send_wr.num_sge);
 
 
 	if (DECR_CQCOUNT(ep) > 0)
 	if (DECR_CQCOUNT(ep) > 0)
 		send_wr.send_flags = 0;
 		send_wr.send_flags = 0;

+ 13 - 14
net/sunrpc/xprtrdma/xprt_rdma.h

@@ -64,9 +64,8 @@ struct rpcrdma_ia {
 	struct ib_device	*ri_device;
 	struct ib_device	*ri_device;
 	struct rdma_cm_id 	*ri_id;
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
 	struct ib_pd		*ri_pd;
-	struct ib_mr		*ri_bind_mem;
+	struct ib_mr		*ri_dma_mr;
 	u32			ri_dma_lkey;
 	u32			ri_dma_lkey;
-	int			ri_have_dma_lkey;
 	struct completion	ri_done;
 	struct completion	ri_done;
 	int			ri_async_rc;
 	int			ri_async_rc;
 	unsigned int		ri_max_frmr_depth;
 	unsigned int		ri_max_frmr_depth;
@@ -88,7 +87,6 @@ struct rpcrdma_ep {
 	int			rep_connected;
 	int			rep_connected;
 	struct ib_qp_init_attr	rep_attr;
 	struct ib_qp_init_attr	rep_attr;
 	wait_queue_head_t 	rep_connect_wait;
 	wait_queue_head_t 	rep_connect_wait;
-	struct rpcrdma_regbuf	*rep_padbuf;
 	struct rdma_conn_param	rep_remote_cma;
 	struct rdma_conn_param	rep_remote_cma;
 	struct sockaddr_storage	rep_remote_addr;
 	struct sockaddr_storage	rep_remote_addr;
 	struct delayed_work	rep_connect_worker;
 	struct delayed_work	rep_connect_worker;
@@ -118,7 +116,6 @@ struct rpcrdma_ep {
 struct rpcrdma_regbuf {
 struct rpcrdma_regbuf {
 	size_t			rg_size;
 	size_t			rg_size;
 	struct rpcrdma_req	*rg_owner;
 	struct rpcrdma_req	*rg_owner;
-	struct ib_mr		*rg_mr;
 	struct ib_sge		rg_iov;
 	struct ib_sge		rg_iov;
 	__be32			rg_base[0] __attribute__ ((aligned(256)));
 	__be32			rg_base[0] __attribute__ ((aligned(256)));
 };
 };
@@ -164,8 +161,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
  * struct rpcrdma_buffer. N is the max number of outstanding requests.
  * struct rpcrdma_buffer. N is the max number of outstanding requests.
  */
  */
 
 
-/* temporary static scatter/gather max */
-#define RPCRDMA_MAX_DATA_SEGS	(64)	/* max scatter/gather */
+#define RPCRDMA_MAX_DATA_SEGS	((1 * 1024 * 1024) / PAGE_SIZE)
 #define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
 #define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
 
 
 struct rpcrdma_buffer;
 struct rpcrdma_buffer;
@@ -257,16 +253,18 @@ struct rpcrdma_mr_seg {		/* chunk descriptors */
 	char		*mr_offset;	/* kva if no page, else offset */
 	char		*mr_offset;	/* kva if no page, else offset */
 };
 };
 
 
+#define RPCRDMA_MAX_IOVS	(2)
+
 struct rpcrdma_req {
 struct rpcrdma_req {
-	unsigned int	rl_niovs;	/* 0, 2 or 4 */
-	unsigned int	rl_nchunks;	/* non-zero if chunks */
-	unsigned int	rl_connect_cookie;	/* retry detection */
-	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+	unsigned int		rl_niovs;
+	unsigned int		rl_nchunks;
+	unsigned int		rl_connect_cookie;
+	struct rpcrdma_buffer	*rl_buffer;
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
-	struct ib_sge	rl_send_iov[4];	/* for active requests */
-	struct rpcrdma_regbuf *rl_rdmabuf;
-	struct rpcrdma_regbuf *rl_sendbuf;
-	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
+	struct rpcrdma_regbuf	*rl_rdmabuf;
+	struct rpcrdma_regbuf	*rl_sendbuf;
+	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
 };
 };
 
 
 static inline struct rpcrdma_req *
 static inline struct rpcrdma_req *
@@ -341,6 +339,7 @@ struct rpcrdma_stats {
 	unsigned long		hardway_register_count;
 	unsigned long		hardway_register_count;
 	unsigned long		failed_marshal_count;
 	unsigned long		failed_marshal_count;
 	unsigned long		bad_reply_count;
 	unsigned long		bad_reply_count;
+	unsigned long		nomsg_call_count;
 };
 };
 
 
 /*
 /*

+ 11 - 7
net/sunrpc/xprtsock.c

@@ -822,6 +822,8 @@ static void xs_reset_transport(struct sock_xprt *transport)
 	if (atomic_read(&transport->xprt.swapper))
 	if (atomic_read(&transport->xprt.swapper))
 		sk_clear_memalloc(sk);
 		sk_clear_memalloc(sk);
 
 
+	kernel_sock_shutdown(sock, SHUT_RDWR);
+
 	write_lock_bh(&sk->sk_callback_lock);
 	write_lock_bh(&sk->sk_callback_lock);
 	transport->inet = NULL;
 	transport->inet = NULL;
 	transport->sock = NULL;
 	transport->sock = NULL;
@@ -829,6 +831,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
 	sk->sk_user_data = NULL;
 	sk->sk_user_data = NULL;
 
 
 	xs_restore_old_callbacks(transport, sk);
 	xs_restore_old_callbacks(transport, sk);
+	xprt_clear_connected(xprt);
 	write_unlock_bh(&sk->sk_callback_lock);
 	write_unlock_bh(&sk->sk_callback_lock);
 	xs_sock_reset_connection_flags(xprt);
 	xs_sock_reset_connection_flags(xprt);
 
 
@@ -1866,7 +1869,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
 		sk->sk_data_ready = xs_local_data_ready;
 		sk->sk_data_ready = xs_local_data_ready;
 		sk->sk_write_space = xs_udp_write_space;
 		sk->sk_write_space = xs_udp_write_space;
 		sk->sk_error_report = xs_error_report;
 		sk->sk_error_report = xs_error_report;
-		sk->sk_allocation = GFP_ATOMIC;
+		sk->sk_allocation = GFP_NOIO;
 
 
 		xprt_clear_connected(xprt);
 		xprt_clear_connected(xprt);
 
 
@@ -2051,7 +2054,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		sk->sk_user_data = xprt;
 		sk->sk_user_data = xprt;
 		sk->sk_data_ready = xs_udp_data_ready;
 		sk->sk_data_ready = xs_udp_data_ready;
 		sk->sk_write_space = xs_udp_write_space;
 		sk->sk_write_space = xs_udp_write_space;
-		sk->sk_allocation = GFP_ATOMIC;
+		sk->sk_allocation = GFP_NOIO;
 
 
 		xprt_set_connected(xprt);
 		xprt_set_connected(xprt);
 
 
@@ -2153,7 +2156,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		sk->sk_state_change = xs_tcp_state_change;
 		sk->sk_state_change = xs_tcp_state_change;
 		sk->sk_write_space = xs_tcp_write_space;
 		sk->sk_write_space = xs_tcp_write_space;
 		sk->sk_error_report = xs_error_report;
 		sk->sk_error_report = xs_error_report;
-		sk->sk_allocation = GFP_ATOMIC;
+		sk->sk_allocation = GFP_NOIO;
 
 
 		/* socket options */
 		/* socket options */
 		sock_reset_flag(sk, SOCK_LINGER);
 		sock_reset_flag(sk, SOCK_LINGER);
@@ -2279,13 +2282,14 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 
 
 	WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
 	WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
 
 
-	/* Start by resetting any existing state */
-	xs_reset_transport(transport);
-
-	if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) {
+	if (transport->sock != NULL) {
 		dprintk("RPC:       xs_connect delayed xprt %p for %lu "
 		dprintk("RPC:       xs_connect delayed xprt %p for %lu "
 				"seconds\n",
 				"seconds\n",
 				xprt, xprt->reestablish_timeout / HZ);
 				xprt, xprt->reestablish_timeout / HZ);
+
+		/* Start by resetting any existing state */
+		xs_reset_transport(transport);
+
 		queue_delayed_work(rpciod_workqueue,
 		queue_delayed_work(rpciod_workqueue,
 				   &transport->connect_worker,
 				   &transport->connect_worker,
 				   xprt->reestablish_timeout);
 				   xprt->reestablish_timeout);