
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6:
  fs: simplify iget & friends
  fs: pull inode->i_lock up out of writeback_single_inode
  fs: rename inode_lock to inode_hash_lock
  fs: move i_wb_list out from under inode_lock
  fs: move i_sb_list out from under inode_lock
  fs: remove inode_lock from iput_final and prune_icache
  fs: Lock the inode LRU list separately
  fs: factor inode disposal
  fs: protect inode->i_state with inode->i_lock
  autofs4: Do not potentially dereference NULL pointer returned by fget() in autofs_dev_ioctl_setpipefd()
  autofs4 - remove autofs4_lock
  autofs4 - fix d_manage() return on rcu-walk
  autofs4 - fix autofs4_expire_indirect() traversal
  autofs4 - fix dentry leak in autofs4_expire_direct()
  autofs4 - reinstate last used update on access
  vfs - check non-mountpoint dentry might block in __follow_mount_rcu()
Linus Torvalds
commit d39dd11c3e
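
As context for the series above (commentary, not part of the commit): the common thread of the VFS patches is breaking the single global inode_lock into the per-inode inode->i_lock plus several narrower global list locks, each taken in a documented order. A minimal userspace sketch of that pattern, with pthread mutexes standing in for kernel spinlocks and every name (obj, mark_dirty, list_lock) invented for illustration:

    #include <pthread.h>
    #include <stdio.h>

    struct obj {
            pthread_mutex_t lock;   /* per-object lock, cf. inode->i_lock */
            unsigned state;
    };

    /* One narrow global lock per list, cf. inode_sb_list_lock and friends. */
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static void mark_dirty(struct obj *o)
    {
            pthread_mutex_lock(&o->lock);
            o->state |= 1;                  /* state changes under the object lock */
            pthread_mutex_unlock(&o->lock);

            pthread_mutex_lock(&list_lock); /* list moves under the list lock only */
            /* ... move o onto a dirty list here ... */
            pthread_mutex_unlock(&list_lock);
    }

    int main(void)
    {
            struct obj o = { PTHREAD_MUTEX_INITIALIZER, 0 };
            mark_dirty(&o);
            printf("state=%u\n", o.state);
            return 0;
    }

Dropping the object lock before taking the list lock, as above, is one way to respect a fixed ordering and avoid deadlock; the real patches document their exact ordering in fs/inode.c below.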

+ 1 - 1
Documentation/filesystems/Locking

@@ -128,7 +128,7 @@ alloc_inode:
 destroy_inode:
 dirty_inode:				(must not sleep)
 write_inode:
-drop_inode:				!!!inode_lock!!!
+drop_inode:				!!!inode->i_lock!!!
 evict_inode:
 put_super:		write
 write_super:		read

+ 11 - 5
Documentation/filesystems/porting

@@ -298,11 +298,14 @@ be used instead.  It gets called whenever the inode is evicted, whether it has
 remaining links or not.  Caller does *not* evict the pagecache or inode-associated
 metadata buffers; getting rid of those is responsibility of method, as it had
 been for ->delete_inode().
-	->drop_inode() returns int now; it's called on final iput() with inode_lock
-held and it returns true if filesystems wants the inode to be dropped.  As before,
-generic_drop_inode() is still the default and it's been updated appropriately.
-generic_delete_inode() is also alive and it consists simply of return 1.  Note that
-all actual eviction work is done by caller after ->drop_inode() returns.
+
+	->drop_inode() returns int now; it's called on final iput() with
+inode->i_lock held and it returns true if filesystems wants the inode to be
+dropped.  As before, generic_drop_inode() is still the default and it's been
+updated appropriately.  generic_delete_inode() is also alive and it consists
+simply of return 1.  Note that all actual eviction work is done by caller after
+->drop_inode() returns.
+
 	clear_inode() is gone; use end_writeback() instead.  As before, it must
 be called exactly once on each call of ->evict_inode() (as it used to be for
 each call of ->delete_inode()).  Unlike before, if you are using inode-associated
@@ -395,6 +398,9 @@ Currently you can only have FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE set,
 so the i_size should not change when hole punching, even when puching the end of
 a file off.
 
+--
+[mandatory]
+
 --
 [mandatory]
 	->get_sb() is gone.  Switch to use of ->mount().  Typically it's just
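
To make the new ->drop_inode() contract concrete (an illustration, not from this commit): a filesystem that never caches unlinked inodes can either point ->drop_inode at generic_delete_inode or supply its own test. A hypothetical examplefs sketch against the interfaces described above:

    /* Called on final iput() with inode->i_lock held; returning nonzero
     * tells the VFS to evict the inode rather than cache it.  All real
     * eviction work happens in the caller after this returns. */
    static int examplefs_drop_inode(struct inode *inode)
    {
            return 1;       /* behave like generic_delete_inode() */
    }

    static const struct super_operations examplefs_sops = {
            .drop_inode     = examplefs_drop_inode,
            /* ... other ops ... */
    };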

+ 1 - 1
Documentation/filesystems/vfs.txt

@@ -254,7 +254,7 @@ or bottom half).
 	should be synchronous or not, not all filesystems check this flag.
 
   drop_inode: called when the last access to the inode is dropped,
-	with the inode_lock spinlock held.
+	with the inode->i_lock spinlock held.
 
 	This method should be either NULL (normal UNIX filesystem
 	semantics) or "generic_delete_inode" (for filesystems that do not

+ 0 - 2
fs/autofs4/autofs_i.h

@@ -61,8 +61,6 @@ do {							\
 		current->pid, __func__, ##args);	\
 } while (0)
 
-extern spinlock_t autofs4_lock;
-
 /* Unified info structure.  This is pointed to by both the dentry and
    inode structures.  Each file in the filesystem has an instance of this
    structure.  It holds a reference to the dentry, so dentries are never

+ 4 - 0
fs/autofs4/dev-ioctl.c

@@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 		return -EBUSY;
 	} else {
 		struct file *pipe = fget(pipefd);
+		if (!pipe) {
+			err = -EBADF;
+			goto out;
+		}
 		if (!pipe->f_op || !pipe->f_op->write) {
 			err = -EPIPE;
 			fput(pipe);

+ 63 - 21
fs/autofs4/expire.c

@@ -86,19 +86,71 @@ done:
 	return status;
 }
 
+/*
+ * Calculate and dget next entry in the subdirs list under root.
+ */
+static struct dentry *get_next_positive_subdir(struct dentry *prev,
+						struct dentry *root)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+	struct list_head *next;
+	struct dentry *p, *q;
+
+	spin_lock(&sbi->lookup_lock);
+
+	if (prev == NULL) {
+		spin_lock(&root->d_lock);
+		prev = dget_dlock(root);
+		next = prev->d_subdirs.next;
+		p = prev;
+		goto start;
+	}
+
+	p = prev;
+	spin_lock(&p->d_lock);
+again:
+	next = p->d_u.d_child.next;
+start:
+	if (next == &root->d_subdirs) {
+		spin_unlock(&p->d_lock);
+		spin_unlock(&sbi->lookup_lock);
+		dput(prev);
+		return NULL;
+	}
+
+	q = list_entry(next, struct dentry, d_u.d_child);
+
+	spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Negative dentry - try next */
+	if (!simple_positive(q)) {
+		spin_unlock(&p->d_lock);
+		p = q;
+		goto again;
+	}
+	dget_dlock(q);
+	spin_unlock(&q->d_lock);
+	spin_unlock(&p->d_lock);
+	spin_unlock(&sbi->lookup_lock);
+
+	dput(prev);
+
+	return q;
+}
+
 /*
  * Calculate and dget next entry in top down tree traversal.
  */
 static struct dentry *get_next_positive_dentry(struct dentry *prev,
 						struct dentry *root)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
 	struct dentry *p, *ret;
 
 	if (prev == NULL)
 		return dget(root);
 
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 relock:
 	p = prev;
 	spin_lock(&p->d_lock);
@@ -110,7 +162,7 @@ again:
 
 			if (p == root) {
 				spin_unlock(&p->d_lock);
-				spin_unlock(&autofs4_lock);
+				spin_unlock(&sbi->lookup_lock);
 				dput(prev);
 				return NULL;
 			}
@@ -140,7 +192,7 @@ again:
 	dget_dlock(ret);
 	spin_unlock(&ret->d_lock);
 	spin_unlock(&p->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	dput(prev);
 
@@ -290,11 +342,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(root);
 	/* No point expiring a pending mount */
-	if (ino->flags & AUTOFS_INF_PENDING) {
-		spin_unlock(&sbi->fs_lock);
-		return NULL;
-	}
-	managed_dentry_set_transit(root);
+	if (ino->flags & AUTOFS_INF_PENDING)
+		goto out;
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
 		ino->flags |= AUTOFS_INF_EXPIRING;
@@ -302,7 +351,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
-	managed_dentry_clear_transit(root);
+out:
 	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
@@ -336,13 +385,12 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	timeout = sbi->exp_timeout;
 
 	dentry = NULL;
-	while ((dentry = get_next_positive_dentry(dentry, root))) {
+	while ((dentry = get_next_positive_subdir(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
 		/* No point expiring a pending mount */
 		if (ino->flags & AUTOFS_INF_PENDING)
-			goto cont;
-		managed_dentry_set_transit(dentry);
+			goto next;
 
 		/*
 		 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -402,8 +450,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			}
 		}
 next:
-		managed_dentry_clear_transit(dentry);
-cont:
 		spin_unlock(&sbi->fs_lock);
 	}
 	return NULL;
@@ -415,13 +461,13 @@ found:
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 	spin_lock(&expired->d_parent->d_lock);
 	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
 	spin_unlock(&expired->d_lock);
 	spin_unlock(&expired->d_parent->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 	return expired;
 }
 
@@ -484,8 +530,6 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	if (!d_unhashed(dentry))
-		managed_dentry_clear_transit(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -513,9 +557,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		spin_lock(&sbi->fs_lock);
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
 		spin_lock(&dentry->d_lock);
-		if (ret)
-			__managed_dentry_clear_transit(dentry);
-		else {
+		if (!ret) {
 			if ((IS_ROOT(dentry) ||
 			    (autofs_type_indirect(sbi->type) &&
 			     IS_ROOT(dentry->d_parent))) &&

+ 21 - 41
fs/autofs4/root.c

@@ -23,8 +23,6 @@
 
 #include "autofs_i.h"
 
-DEFINE_SPINLOCK(autofs4_lock);
-
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * autofs file system so just let the libfs routines handle
 	 * it.
 	 */
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->lookup_lock);
 		return -ENOENT;
 	}
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 out:
 	return dcache_dir_open(inode, file);
@@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
 	list_for_each(p, head) {
@@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 			dget_dlock(active);
 			spin_unlock(&active->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&autofs4_lock);
 			return active;
 		}
 next:
 		spin_unlock(&active->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->expiring_list;
 	list_for_each(p, head) {
@@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 			dget_dlock(expiring);
 			spin_unlock(&expiring->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&autofs4_lock);
 			return expiring;
 		}
 next:
 		spin_unlock(&expiring->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -275,17 +267,16 @@ static int autofs4_mount_wait(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	int status;
+	int status = 0;
 
 	if (ino->flags & AUTOFS_INF_PENDING) {
 		DPRINTK("waiting for mount name=%.*s",
 			dentry->d_name.len, dentry->d_name.name);
 		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 		DPRINTK("mount wait done status=%d", status);
-		ino->last_used = jiffies;
-		return status;
 	}
-	return 0;
+	ino->last_used = jiffies;
+	return status;
 }
 
 static int do_expire_wait(struct dentry *dentry)
@@ -319,9 +310,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
 	 */
 	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
 		struct dentry *parent = dentry->d_parent;
+		struct autofs_info *ino;
 		struct dentry *new = d_lookup(parent, &dentry->d_name);
 		if (!new)
 			return NULL;
+		ino = autofs4_dentry_ino(new);
+		ino->last_used = jiffies;
 		dput(path->dentry);
 		path->dentry = new;
 	}
@@ -338,18 +332,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	DPRINTK("dentry=%p %.*s",
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
-	/*
-	 * Someone may have manually umounted this or it was a submount
-	 * that has gone away.
-	 */
-	spin_lock(&dentry->d_lock);
-	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-		if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
-		     (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-			__managed_dentry_set_transit(path->dentry);
-	}
-	spin_unlock(&dentry->d_lock);
-
 	/* The daemon never triggers a mount. */
 	if (autofs4_oz_mode(sbi))
 		return NULL;
@@ -418,18 +400,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
done:
 	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
 		/*
-		 * Any needed mounting has been completed and the path updated
-		 * so turn this into a normal dentry so we don't continually
-		 * call ->d_automount() and ->d_manage().
-		 */
-		spin_lock(&dentry->d_lock);
-		__managed_dentry_clear_transit(dentry);
-		/*
+		 * Any needed mounting has been completed and the path
+		 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
+		 * call ->d_automount() on rootless multi-mounts since
+		 * it can lead to an incorrect ELOOP error return.
+		 *
 		 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
 		 * symlinks as in all other cases the dentry will be covered by
 		 * an actual mount so ->d_automount() won't be called during
 		 * the follow.
 		 */
+		spin_lock(&dentry->d_lock);
 		if ((!d_mountpoint(dentry) &&
 		    !list_empty(&dentry->d_subdirs)) ||
 		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
@@ -455,6 +436,8 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 
 	/* The daemon never waits. */
 	if (autofs4_oz_mode(sbi)) {
+		if (rcu_walk)
+			return 0;
 		if (!d_mountpoint(dentry))
 			return -EISDIR;
 		return 0;
@@ -612,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	dir->i_mtime = CURRENT_TIME;
 
-	spin_lock(&autofs4_lock);
-	autofs4_add_expiring(dentry);
+	spin_lock(&sbi->lookup_lock);
+	__autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	return 0;
 }
@@ -686,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi))
 		return -EACCES;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	if (!list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&autofs4_lock);
 		return -ENOTEMPTY;
 	}
 	__autofs4_add_expiring(dentry);
-	spin_unlock(&sbi->lookup_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	if (sbi->version < 5)
 		autofs_clear_leaf_automount_flags(dentry);

+ 3 - 3
fs/autofs4/waitq.c

@@ -197,12 +197,12 @@ rename_retry:
 
 	seq = read_seqbegin(&rename_lock);
 	rcu_read_lock();
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->fs_lock);
 	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
 		len += tmp->d_name.len + 1;
 
 	if (!len || --len > NAME_MAX) {
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->fs_lock);
 		rcu_read_unlock();
 		if (read_seqretry(&rename_lock, seq))
 			goto rename_retry;
@@ -218,7 +218,7 @@ rename_retry:
 		p -= tmp->d_name.len;
 		strncpy(p, tmp->d_name.name, tmp->d_name.len);
 	}
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->fs_lock);
 	rcu_read_unlock();
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;

+ 4 - 2
fs/block_dev.c

@@ -55,11 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_wb_list_lock);
 }
 
 static sector_t max_block(struct block_device *bdev)

+ 1 - 1
fs/buffer.c

@@ -1138,7 +1138,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * inode list.
  *
  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and the global inode_lock.
+ * mapping->tree_lock and mapping->host->i_lock.
  */
 void mark_buffer_dirty(struct buffer_head *bh)
 {

+ 11 - 7
fs/drop_caches.c

@@ -8,6 +8,7 @@
 #include <linux/writeback.h>
 #include <linux/sysctl.h>
 #include <linux/gfp.h>
+#include "internal.h"
 
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
@@ -16,20 +17,23 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
 	struct inode *inode, *toput_inode = NULL;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
-			continue;
-		if (inode->i_mapping->nrpages == 0)
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (inode->i_mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		__iget(inode);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(toput_inode);
 }
 

+ 91 - 50
fs/fs-writeback.c

@@ -175,6 +175,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 	spin_unlock_bh(&bdi->wb_lock);
 }
 
+/*
+ * Remove the inode from the writeback list it is on.
+ */
+void inode_wb_list_del(struct inode *inode)
+{
+	spin_lock(&inode_wb_list_lock);
+	list_del_init(&inode->i_wb_list);
+	spin_unlock(&inode_wb_list_lock);
+}
+
+
 /*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
@@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
+	assert_spin_locked(&inode_wb_list_lock);
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
@@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
+	assert_spin_locked(&inode_wb_list_lock);
 	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
 {
 	/*
-	 * Prevent speculative execution through spin_unlock(&inode_lock);
+	 * Prevent speculative execution through
+	 * spin_unlock(&inode_wb_list_lock);
 	 */
+
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_SYNC);
 }
@@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
+	assert_spin_locked(&inode_wb_list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
@@ -306,25 +322,25 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wait_queue_head_t *wqh;
 
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-	 while (inode->i_state & I_SYNC) {
-		spin_unlock(&inode_lock);
+	while (inode->i_state & I_SYNC) {
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_wb_list_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
+		spin_lock(&inode->i_lock);
 	}
 }
 
 /*
- * Write out an inode's dirty pages.  Called under inode_lock.  Either the
- * caller has ref on the inode (either via __iget or via syscall against an fd)
- * or the inode has I_WILL_FREE set (via generic_forget_inode)
+ * Write out an inode's dirty pages.  Called under inode_wb_list_lock and
+ * inode->i_lock.  Either the caller has an active reference on the inode or
+ * the inode has I_WILL_FREE set.
 *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile.  We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
- *
- * Called under inode_lock.
 */
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -333,6 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	unsigned dirty;
 	int ret;
 
+	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&inode->i_lock);
+
 	if (!atomic_read(&inode->i_count))
 		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
 	else
@@ -363,7 +382,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
 	inode->i_state &= ~I_DIRTY_PAGES;
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_wb_list_lock);
 
 	ret = do_writepages(mapping, wbc);
 
@@ -383,10 +403,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * due to delalloc, clear dirty metadata flags right before
 	 * write_inode()
 	 */
-	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		int err = write_inode(inode, wbc);
@@ -394,7 +414,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			ret = err;
 	}
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -506,7 +527,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		 * kind does not need peridic writeout yet, and for the latter
 		 * kind writeout is handled by the freer.
 		 */
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			requeue_io(inode);
 			continue;
 		}
@@ -515,10 +538,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
 		 */
-		if (inode_dirtied_after(inode, wbc->wb_start))
+		if (inode_dirtied_after(inode, wbc->wb_start)) {
+			spin_unlock(&inode->i_lock);
 			return 1;
+		}
 
 		__iget(inode);
+
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
 		if (wbc->pages_skipped != pages_skipped) {
@@ -528,10 +554,11 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			 */
 			redirty_tail(inode);
 		}
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_wb_list_lock);
 		iput(inode);
 		cond_resched();
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		if (wbc->nr_to_write <= 0) {
 			wbc->more_io = 1;
 			return 1;
@@ -550,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 
 	if (!wbc->wb_start)
 		wbc->wb_start = jiffies; /* livelock avoidance */
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 
@@ -568,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 		if (ret)
 			break;
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 	/* Leave any unwritten inodes on b_io */
 }
 
@@ -577,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
 {
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 	writeback_sb_inodes(sb, wb, wbc, true);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 }
 
 /*
@@ -720,13 +747,15 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * become available for writeback. Otherwise
 		 * we'll just busyloop.
 		 */
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		if (!list_empty(&wb->b_more_io))  {
 			inode = wb_inode(wb->b_more_io.prev);
 			trace_wbc_writeback_wait(&wbc, wb->bdi);
+			spin_lock(&inode->i_lock);
 			inode_wait_for_writeback(inode);
+			spin_unlock(&inode->i_lock);
 		}
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 	}
 
 	return wrote;
@@ -992,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = NULL;
-	bool wakeup_bdi = false;
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1016,7 +1044,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if (unlikely(block_dump))
 		block_dump___mark_inode_dirty(inode);
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
@@ -1028,7 +1056,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * superblock list, based upon its state.
 		 */
 		if (inode->i_state & I_SYNC)
-			goto out;
+			goto out_unlock_inode;
 
 		/*
 		 * Only add valid (hashed) inodes to the superblock's
@@ -1036,16 +1064,17 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 */
 		if (!S_ISBLK(inode->i_mode)) {
 			if (inode_unhashed(inode))
-				goto out;
+				goto out_unlock_inode;
 		}
 		if (inode->i_state & I_FREEING)
-			goto out;
+			goto out_unlock_inode;
 
 		/*
 		 * If the inode was already on b_dirty/b_io/b_more_io, don't
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
+			bool wakeup_bdi = false;
 			bdi = inode_to_bdi(inode);
 
 			if (bdi_cap_writeback_dirty(bdi)) {
@@ -1062,15 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 					wakeup_bdi = true;
 			}
 
+			spin_unlock(&inode->i_lock);
+			spin_lock(&inode_wb_list_lock);
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			spin_unlock(&inode_wb_list_lock);
+
+			if (wakeup_bdi)
+				bdi_wakeup_thread_delayed(bdi);
+			return;
 		}
 	}
-out:
-	spin_unlock(&inode_lock);
+out_unlock_inode:
+	spin_unlock(&inode->i_lock);
 
-	if (wakeup_bdi)
-		bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
@@ -1101,7 +1135,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	 */
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 
 	/*
 	 * Data integrity sync. Must wait for all pages under writeback,
@@ -1111,22 +1145,25 @@ static void wait_sb_inodes(struct super_block *sb)
 	 * we still have to wait for that writeout.
 	 */
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		struct address_space *mapping;
+		struct address_space *mapping = inode->i_mapping;
 
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
-			continue;
-		mapping = inode->i_mapping;
-		if (mapping->nrpages == 0)
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		__iget(inode);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
+
 		/*
-		 * We hold a reference to 'inode' so it couldn't have
-		 * been removed from s_inodes list while we dropped the
-		 * inode_lock.  We cannot iput the inode now as we can
-		 * be holding the last reference and we cannot iput it
-		 * under inode_lock. So we keep the reference and iput
-		 * it later.
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the
+		 * inode_sb_list_lock.  We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * later.
 		 */
 		iput(old_inode);
 		old_inode = inode;
 
 
 		cond_resched();
 		cond_resched();
 
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(old_inode);
 	iput(old_inode);
 }
 }
 
 
@@ -1271,9 +1308,11 @@ int write_inode_now(struct inode *inode, int sync)
 		wbc.nr_to_write = 0;
 
 	might_sleep();
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	ret = writeback_single_inode(inode, &wbc);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_wb_list_lock);
 	if (sync)
 		inode_sync_wait(inode);
 	return ret;
@@ -1295,9 +1334,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	ret = writeback_single_inode(inode, wbc);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_wb_list_lock);
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
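
A pattern repeated in the wait_sb_inodes and drop_pagecache_sb hunks deserves a note (commentary, not part of the patch): to call blocking code on a list member you pin it with a reference under the list lock, drop the lock, do the work, and release the previous member's reference only while unlocked, because the final put may free it. A self-contained userspace analogue, with every name (item, walk_all, slow_work) invented and the unlink step elided:

    #include <pthread.h>
    #include <stdlib.h>

    struct item {
            int refcount;           /* protected by list_lock in this sketch */
            struct item *next;
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct item *head;

    static void slow_work(struct item *it) { (void)it; /* may block */ }

    static void put_item(struct item *it)
    {
            pthread_mutex_lock(&list_lock);
            if (--it->refcount == 0) {
                    /* ... unlink it from the list here, then free it ... */
                    pthread_mutex_unlock(&list_lock);
                    free(it);
                    return;
            }
            pthread_mutex_unlock(&list_lock);
    }

    static void walk_all(void)
    {
            struct item *prev = NULL;

            pthread_mutex_lock(&list_lock);
            for (struct item *it = head; it; it = it->next) {
                    it->refcount++;                 /* pin: keeps *it alive */
                    pthread_mutex_unlock(&list_lock);

                    slow_work(it);                  /* blocking work, unlocked */
                    if (prev)
                            put_item(prev);         /* old ref dropped unlocked */
                    prev = it;

                    pthread_mutex_lock(&list_lock); /* retake before it->next */
            }
            pthread_mutex_unlock(&list_lock);
            if (prev)
                    put_item(prev);
    }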

+ 312 - 344
fs/inode.c

@@ -26,6 +26,38 @@
 #include <linux/posix_acl.h>
 #include <linux/ima.h>
 #include <linux/cred.h>
+#include "internal.h"
+
+/*
+ * inode locking rules.
+ *
+ * inode->i_lock protects:
+ *   inode->i_state, inode->i_hash, __iget()
+ * inode_lru_lock protects:
+ *   inode_lru, inode->i_lru
+ * inode_sb_list_lock protects:
+ *   sb->s_inodes, inode->i_sb_list
+ * inode_wb_list_lock protects:
+ *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ * inode_hash_lock protects:
+ *   inode_hashtable, inode->i_hash
+ *
+ * Lock ordering:
+ *
+ * inode_sb_list_lock
+ *   inode->i_lock
+ *     inode_lru_lock
+ *
+ * inode_wb_list_lock
+ *   inode->i_lock
+ *
+ * inode_hash_lock
+ *   inode_sb_list_lock
+ *   inode->i_lock
+ *
+ * iunique_lock
+ *   inode_hash_lock
+ */
 
 /*
  * This is needed for the following functions:
@@ -60,6 +92,8 @@
 
 static unsigned int i_hash_mask __read_mostly;
 static unsigned int i_hash_shift __read_mostly;
+static struct hlist_head *inode_hashtable __read_mostly;
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
 /*
  * Each inode can be on two separate lists. One is
@@ -74,15 +108,10 @@ static unsigned int i_hash_shift __read_mostly;
  */
 
 static LIST_HEAD(inode_lru);
-static struct hlist_head *inode_hashtable __read_mostly;
+static DEFINE_SPINLOCK(inode_lru_lock);
 
-/*
- * A simple spinlock to protect the list manipulations.
- *
- * NOTE! You also have to own the lock if you change
- * the i_state of an inode while it is in use..
- */
-DEFINE_SPINLOCK(inode_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
 
 /*
  * iprune_sem provides exclusion between the icache shrinking and the
@@ -137,15 +166,6 @@ int proc_nr_inodes(ctl_table *table, int write,
 }
 #endif
 
-static void wake_up_inode(struct inode *inode)
-{
-	/*
-	 * Prevent speculative execution through spin_unlock(&inode_lock);
-	 */
-	smp_mb();
-	wake_up_bit(&inode->i_state, __I_NEW);
-}
-
 /**
  * inode_init_always - perform inode structure intialisation
  * @sb: superblock inode belongs to
@@ -336,7 +356,7 @@ static void init_once(void *foo)
 }
 
 /*
- * inode_lock must be held
+ * inode->i_lock must be held
 */
 void __iget(struct inode *inode)
 {
@@ -354,23 +374,22 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
+	spin_lock(&inode_lru_lock);
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode_lru);
 		inodes_stat.nr_unused++;
 	}
+	spin_unlock(&inode_lru_lock);
 }
 
 static void inode_lru_list_del(struct inode *inode)
 {
+	spin_lock(&inode_lru_lock);
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
 		inodes_stat.nr_unused--;
 	}
-}
-
-static inline void __inode_sb_list_add(struct inode *inode)
-{
-	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+	spin_unlock(&inode_lru_lock);
 }
 
 /**
@@ -379,15 +398,17 @@ static inline void __inode_sb_list_add(struct inode *inode)
  */
 void inode_sb_list_add(struct inode *inode)
 {
-	spin_lock(&inode_lock);
-	__inode_sb_list_add(inode);
-	spin_unlock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
+	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+	spin_unlock(&inode_sb_list_lock);
 }
 EXPORT_SYMBOL_GPL(inode_sb_list_add);
 
-static inline void __inode_sb_list_del(struct inode *inode)
+static inline void inode_sb_list_del(struct inode *inode)
 {
+	spin_lock(&inode_sb_list_lock);
 	list_del_init(&inode->i_sb_list);
+	spin_unlock(&inode_sb_list_lock);
 }
 
 static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -412,23 +433,14 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 {
 	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
+	spin_lock(&inode->i_lock);
 	hlist_add_head(&inode->i_hash, b);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_hash_lock);
 }
 EXPORT_SYMBOL(__insert_inode_hash);
 
-/**
- *	__remove_inode_hash - remove an inode from the hash
- *	@inode: inode to unhash
- *
- *	Remove an inode from the superblock.
- */
-static void __remove_inode_hash(struct inode *inode)
-{
-	hlist_del_init(&inode->i_hash);
-}
-
 /**
 *	remove_inode_hash - remove an inode from the hash
 *	@inode: inode to unhash
@@ -437,9 +449,11 @@ static void __remove_inode_hash(struct inode *inode)
  */
 void remove_inode_hash(struct inode *inode)
 {
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
+	spin_lock(&inode->i_lock);
 	hlist_del_init(&inode->i_hash);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_hash_lock);
 }
 EXPORT_SYMBOL(remove_inode_hash);
 
@@ -456,10 +470,29 @@ void end_writeback(struct inode *inode)
 }
 EXPORT_SYMBOL(end_writeback);
 
+/*
+ * Free the inode passed in, removing it from the lists it is still connected
+ * to. We remove any pages still attached to the inode and wait for any IO that
+ * is still in progress before finally destroying the inode.
+ *
+ * An inode must already be marked I_FREEING so that we avoid the inode being
+ * moved back onto lists if we race with other code that manipulates the lists
+ * (e.g. writeback_single_inode). The caller is responsible for setting this.
+ *
+ * An inode must already be removed from the LRU list before being evicted from
+ * the cache. This should occur atomically with setting the I_FREEING state
+ * flag, so no inodes here should ever be on the LRU when being evicted.
+ */
 static void evict(struct inode *inode)
 {
 	const struct super_operations *op = inode->i_sb->s_op;
 
+	BUG_ON(!(inode->i_state & I_FREEING));
+	BUG_ON(!list_empty(&inode->i_lru));
+
+	inode_wb_list_del(inode);
+	inode_sb_list_del(inode);
+
 	if (op->evict_inode) {
 		op->evict_inode(inode);
 	} else {
@@ -471,6 +504,15 @@ static void evict(struct inode *inode)
 		bd_forget(inode);
 	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
 		cd_forget(inode);
+
+	remove_inode_hash(inode);
+
+	spin_lock(&inode->i_lock);
+	wake_up_bit(&inode->i_state, __I_NEW);
+	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+	spin_unlock(&inode->i_lock);
+
+	destroy_inode(inode);
 }
 
 /*
@@ -489,14 +531,6 @@ static void dispose_list(struct list_head *head)
 		list_del_init(&inode->i_lru);
 
 		evict(inode);
-
-		spin_lock(&inode_lock);
-		__remove_inode_hash(inode);
-		__inode_sb_list_del(inode);
-		spin_unlock(&inode_lock);
-
-		wake_up_inode(inode);
-		destroy_inode(inode);
 	}
 }
 
@@ -514,25 +548,23 @@ void evict_inodes(struct super_block *sb)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (atomic_read(&inode->i_count))
 			continue;
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 
 		inode->i_state |= I_FREEING;
-
-		/*
-		 * Move the inode off the IO lists and LRU once I_FREEING is
-		 * set so that it won't get moved back on there if it is dirty.
-		 */
-		list_move(&inode->i_lru, &dispose);
-		list_del_init(&inode->i_wb_list);
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			inodes_stat.nr_unused--;
+		inode_lru_list_del(inode);
+		spin_unlock(&inode->i_lock);
+		list_add(&inode->i_lru, &dispose);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 
 	dispose_list(&dispose);
 
@@ -561,31 +593,30 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		if (inode->i_state & I_DIRTY && !kill_dirty) {
+			spin_unlock(&inode->i_lock);
 			busy = 1;
 			continue;
 		}
 		if (atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
 			busy = 1;
 			continue;
 		}
 
 		inode->i_state |= I_FREEING;
-
-		/*
-		 * Move the inode off the IO lists and LRU once I_FREEING is
-		 * set so that it won't get moved back on there if it is dirty.
-		 */
-		list_move(&inode->i_lru, &dispose);
-		list_del_init(&inode->i_wb_list);
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			inodes_stat.nr_unused--;
+		inode_lru_list_del(inode);
+		spin_unlock(&inode->i_lock);
+		list_add(&inode->i_lru, &dispose);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 
 	dispose_list(&dispose);
 
@@ -607,7 +638,7 @@ static int can_unuse(struct inode *inode)
 
 /*
  * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside inode_lock by dispose_list().
+ * temporary list and then are freed outside inode_lru_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  If the inode has metadata buffers attached to
@@ -628,7 +659,7 @@ static void prune_icache(int nr_to_scan)
 	unsigned long reap = 0;
 
 	down_read(&iprune_sem);
-	spin_lock(&inode_lock);
+	spin_lock(&inode_lru_lock);
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
 
@@ -637,6 +668,16 @@ static void prune_icache(int nr_to_scan)
 
 		inode = list_entry(inode_lru.prev, struct inode, i_lru);
 
+		/*
+		 * we are inverting the inode_lru_lock/inode->i_lock here,
+		 * so use a trylock. If we fail to get the lock, just move the
+		 * inode to the back of the list so we don't spin on it.
+		 */
+		if (!spin_trylock(&inode->i_lock)) {
+			list_move(&inode->i_lru, &inode_lru);
+			continue;
+		}
+
 		/*
 		 * Referenced or dirty inodes are still in use. Give them
 		 * another pass through the LRU as we canot reclaim them now.
@@ -644,47 +685,51 @@ static void prune_icache(int nr_to_scan)
 		if (atomic_read(&inode->i_count) ||
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
 		    (inode->i_state & ~I_REFERENCED)) {
 			list_del_init(&inode->i_lru);
 			list_del_init(&inode->i_lru);
+			spin_unlock(&inode->i_lock);
 			inodes_stat.nr_unused--;
 			inodes_stat.nr_unused--;
 			continue;
 			continue;
 		}
 		}
 
 
 		/* recently referenced inodes get one more pass */
 		/* recently referenced inodes get one more pass */
 		if (inode->i_state & I_REFERENCED) {
 		if (inode->i_state & I_REFERENCED) {
-			list_move(&inode->i_lru, &inode_lru);
 			inode->i_state &= ~I_REFERENCED;
 			inode->i_state &= ~I_REFERENCED;
+			list_move(&inode->i_lru, &inode_lru);
+			spin_unlock(&inode->i_lock);
 			continue;
 			continue;
 		}
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 			__iget(inode);
 			__iget(inode);
-			spin_unlock(&inode_lock);
+			spin_unlock(&inode->i_lock);
+			spin_unlock(&inode_lru_lock);
 			if (remove_inode_buffers(inode))
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
 				reap += invalidate_mapping_pages(&inode->i_data,
 								0, -1);
 								0, -1);
 			iput(inode);
 			iput(inode);
-			spin_lock(&inode_lock);
+			spin_lock(&inode_lru_lock);
 
 
 			if (inode != list_entry(inode_lru.next,
 			if (inode != list_entry(inode_lru.next,
 						struct inode, i_lru))
 						struct inode, i_lru))
 				continue;	/* wrong inode or list_empty */
 				continue;	/* wrong inode or list_empty */
-			if (!can_unuse(inode))
+			/* avoid lock inversions with trylock */
+			if (!spin_trylock(&inode->i_lock))
+				continue;
+			if (!can_unuse(inode)) {
+				spin_unlock(&inode->i_lock);
 				continue;
 				continue;
+			}
 		}
 		}
 		WARN_ON(inode->i_state & I_NEW);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_FREEING;
 		inode->i_state |= I_FREEING;
+		spin_unlock(&inode->i_lock);
 
 
-		/*
-		 * Move the inode off the IO lists and LRU once I_FREEING is
-		 * set so that it won't get moved back on there if it is dirty.
-		 */
 		list_move(&inode->i_lru, &freeable);
 		list_move(&inode->i_lru, &freeable);
-		list_del_init(&inode->i_wb_list);
 		inodes_stat.nr_unused--;
 		inodes_stat.nr_unused--;
 	}
 	}
 	if (current_is_kswapd())
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
 	else
 	else
 		__count_vm_events(PGINODESTEAL, reap);
 		__count_vm_events(PGINODESTEAL, reap);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_lru_lock);
 
 
 	dispose_list(&freeable);
 	dispose_list(&freeable);
 	up_read(&iprune_sem);
 	up_read(&iprune_sem);
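
[Editor's note: the trylock in prune_icache() above is the series' standard answer to a lock-order inversion. The normal order elsewhere is inode->i_lock first, then inode_lru_lock, but the scanner already holds inode_lru_lock when it needs i_lock. A minimal standalone sketch of the idiom, with invented names (not kernel code as-is):

static DEFINE_SPINLOCK(list_lock);

struct item {
	spinlock_t lock;		/* normal order: item->lock, then list_lock */
	struct list_head lru;
};

/* Scanner runs in the inverted order: caller already holds list_lock. */
static struct item *scan(struct list_head *head)
{
	struct item *it;

	list_for_each_entry(it, head, lru) {
		if (spin_trylock(&it->lock))
			return it;	/* locked without blocking in the wrong order */
		/* contended: skip it (or rotate it to the tail) and retry later */
	}
	return NULL;
}

On contention the scanner never blocks, so the inversion can cause a missed scan at worst, not a deadlock.]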
@@ -733,15 +778,21 @@ static struct inode *find_inode(struct super_block *sb,
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
-		if (inode->i_sb != sb)
+		spin_lock(&inode->i_lock);
+		if (inode->i_sb != sb) {
+			spin_unlock(&inode->i_lock);
 			continue;
-		if (!test(inode, data))
+		}
+		if (!test(inode, data)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
 		__iget(inode);
+		spin_unlock(&inode->i_lock);
 		return inode;
 	}
 	return NULL;
@@ -759,15 +810,21 @@ static struct inode *find_inode_fast(struct super_block *sb,
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
-		if (inode->i_ino != ino)
+		spin_lock(&inode->i_lock);
+		if (inode->i_ino != ino) {
+			spin_unlock(&inode->i_lock);
 			continue;
-		if (inode->i_sb != sb)
+		}
+		if (inode->i_sb != sb) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
 		__iget(inode);
+		spin_unlock(&inode->i_lock);
 		return inode;
 	}
 	return NULL;
@@ -827,19 +884,26 @@ struct inode *new_inode(struct super_block *sb)
 {
 	struct inode *inode;
 
-	spin_lock_prefetch(&inode_lock);
+	spin_lock_prefetch(&inode_sb_list_lock);
 
 	inode = alloc_inode(sb);
 	if (inode) {
-		spin_lock(&inode_lock);
-		__inode_sb_list_add(inode);
+		spin_lock(&inode->i_lock);
 		inode->i_state = 0;
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		inode_sb_list_add(inode);
 	}
 	return inode;
 }
 EXPORT_SYMBOL(new_inode);
 
+/**
+ * unlock_new_inode - clear the I_NEW state and wake up any waiters
+ * @inode:	new inode to unlock
+ *
+ * Called when the inode is fully initialised to clear the new state of the
+ * inode and wake up anyone waiting for the inode to finish initialisation.
+ */
 void unlock_new_inode(struct inode *inode)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -859,51 +923,67 @@ void unlock_new_inode(struct inode *inode)
 		}
 	}
 #endif
-	/*
-	 * This is special!  We do not need the spinlock when clearing I_NEW,
-	 * because we're guaranteed that nobody else tries to do anything about
-	 * the state of the inode when it is locked, as we just created it (so
-	 * there can be no old holders that haven't tested I_NEW).
-	 * However we must emit the memory barrier so that other CPUs reliably
-	 * see the clearing of I_NEW after the other inode initialisation has
-	 * completed.
-	 */
-	smp_mb();
+	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
 	inode->i_state &= ~I_NEW;
-	wake_up_inode(inode);
+	wake_up_bit(&inode->i_state, __I_NEW);
+	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(unlock_new_inode);
 
-/*
- * This is called without the inode lock held.. Be careful.
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb:		super block of file system
+ * @hashval:	hash value (usually inode number) to get
+ * @test:	callback used for comparisons between inodes
+ * @set:	callback used to initialize a new struct inode
+ * @data:	opaque data pointer to pass to @test and @set
+ *
+ * Search for the inode specified by @hashval and @data in the inode cache,
+ * and if present it is returned with an increased reference count. This is
+ * a generalized version of iget_locked() for file systems where the inode
+ * number is not sufficient for unique identification of an inode.
 *
- * We no longer cache the sb_flags in i_flags - see fs.h
- *	-- rmk@arm.uk.linux.org
+ * If the inode is not in cache, allocate a new inode and return it locked,
+ * hashed, and with the I_NEW flag set. The file system gets to fill it in
+ * before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_hash_lock held, so can't
+ * sleep.
 */
-static struct inode *get_new_inode(struct super_block *sb,
-				struct hlist_head *head,
-				int (*test)(struct inode *, void *),
-				int (*set)(struct inode *, void *),
-				void *data)
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+		int (*test)(struct inode *, void *),
+		int (*set)(struct inode *, void *), void *data)
 {
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 	struct inode *inode;
 
+	spin_lock(&inode_hash_lock);
+	inode = find_inode(sb, head, test, data);
+	spin_unlock(&inode_hash_lock);
+
+	if (inode) {
+		wait_on_inode(inode);
+		return inode;
+	}
+
 	inode = alloc_inode(sb);
 	if (inode) {
 		struct inode *old;
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_hash_lock);
 		/* We released the lock, so.. */
 		old = find_inode(sb, head, test, data);
 		if (!old) {
 			if (set(inode, data))
 				goto set_failed;
 
-			hlist_add_head(&inode->i_hash, head);
-			__inode_sb_list_add(inode);
+			spin_lock(&inode->i_lock);
 			inode->i_state = I_NEW;
-			spin_unlock(&inode_lock);
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
+			inode_sb_list_add(inode);
+			spin_unlock(&inode_hash_lock);
 
 			/* Return the locked inode with I_NEW set, the
 			 * caller is responsible for filling in the contents
@@ -916,7 +996,7 @@ static struct inode *get_new_inode(struct super_block *sb,
 		 * us. Use the old inode instead of the one we just
 		 * allocated.
 		 */
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_hash_lock);
 		destroy_inode(inode);
 		inode = old;
 		wait_on_inode(inode);
@@ -924,33 +1004,53 @@ static struct inode *get_new_inode(struct super_block *sb,
 	return inode;
 
 set_failed:
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_hash_lock);
 	destroy_inode(inode);
 	return NULL;
 }
+EXPORT_SYMBOL(iget5_locked);
 
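[Editor's note: a typical caller of the relocated iget5_locked() follows the pattern below. This is a hedged sketch: the myfs_* names, the MYFS_I() accessor and the objid key are invented for illustration, not part of the patch.

struct myfs_inode_info {
	unsigned long objid;		/* on-disk object id */
	struct inode vfs_inode;
};

static inline struct myfs_inode_info *MYFS_I(struct inode *inode)
{
	return container_of(inode, struct myfs_inode_info, vfs_inode);
}

struct myfs_key { unsigned long objid; };

static int myfs_test(struct inode *inode, void *data)
{
	/* runs under inode_hash_lock: must not sleep */
	return MYFS_I(inode)->objid == ((struct myfs_key *)data)->objid;
}

static int myfs_set(struct inode *inode, void *data)
{
	MYFS_I(inode)->objid = ((struct myfs_key *)data)->objid;
	return 0;			/* non-zero aborts iget5_locked() */
}

struct inode *myfs_iget(struct super_block *sb, unsigned long objid)
{
	struct myfs_key key = { .objid = objid };
	struct inode *inode;

	inode = iget5_locked(sb, objid, myfs_test, myfs_set, &key);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* cache hit, already initialised */

	/* ... read the object from disk and fill in the inode ... */
	unlock_new_inode(inode);
	return inode;
}

The callbacks stay non-sleeping because, as the new kerneldoc says, they now run under inode_hash_lock rather than the old global inode_lock.]
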
-/*
- * get_new_inode_fast is the fast path version of get_new_inode, see the
- * comment at iget_locked for details.
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb:		super block of file system
+ * @ino:	inode number to get
+ *
+ * Search for the inode specified by @ino in the inode cache and if present
+ * return it with an increased reference count. This is for file systems
+ * where the inode number is sufficient for unique identification of an inode.
+ *
+ * If the inode is not in cache, allocate a new inode and return it locked,
+ * hashed, and with the I_NEW flag set.  The file system gets to fill it in
+ * before unlocking it via unlock_new_inode().
 */
-static struct inode *get_new_inode_fast(struct super_block *sb,
-				struct hlist_head *head, unsigned long ino)
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 {
+	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 	struct inode *inode;
 
+	spin_lock(&inode_hash_lock);
+	inode = find_inode_fast(sb, head, ino);
+	spin_unlock(&inode_hash_lock);
+	if (inode) {
+		wait_on_inode(inode);
+		return inode;
+	}
+
 	inode = alloc_inode(sb);
 	if (inode) {
 		struct inode *old;
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_hash_lock);
 		/* We released the lock, so.. */
 		old = find_inode_fast(sb, head, ino);
 		if (!old) {
 			inode->i_ino = ino;
-			hlist_add_head(&inode->i_hash, head);
-			__inode_sb_list_add(inode);
+			spin_lock(&inode->i_lock);
 			inode->i_state = I_NEW;
-			spin_unlock(&inode_lock);
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
+			inode_sb_list_add(inode);
+			spin_unlock(&inode_hash_lock);
 
 			/* Return the locked inode with I_NEW set, the
 			 * caller is responsible for filling in the contents
@@ -963,13 +1063,14 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 		 * us. Use the old inode instead of the one we just
 		 * allocated.
 		 */
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_hash_lock);
 		destroy_inode(inode);
 		inode = old;
 		wait_on_inode(inode);
 	}
 	return inode;
 }
+EXPORT_SYMBOL(iget_locked);
 
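[Editor's note: iget_locked() follows the same I_NEW protocol as iget5_locked(); a minimal sketch, where myfs_read_inode() is an invented helper and iget_failed() is the stock error path that unhashes and releases a half-initialised inode:

struct inode *myfs_iget_fast(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* found in cache, fully set up */

	if (myfs_read_inode(inode)) {	/* fill i_mode, i_size, ops, ... */
		iget_failed(inode);	/* unhash and drop the bad inode */
		return ERR_PTR(-EIO);
	}
	unlock_new_inode(inode);	/* clears I_NEW, wakes waiters */
	return inode;
}

Concurrent lookups of the same ino block in wait_on_inode() until unlock_new_inode() or iget_failed() runs.]
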
 /*
  * search the inode cache for a matching inode number.
@@ -984,10 +1085,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
 	struct hlist_node *node;
 	struct inode *inode;
 
+	spin_lock(&inode_hash_lock);
 	hlist_for_each_entry(inode, node, b, i_hash) {
-		if (inode->i_ino == ino && inode->i_sb == sb)
+		if (inode->i_ino == ino && inode->i_sb == sb) {
+			spin_unlock(&inode_hash_lock);
 			return 0;
+		}
 	}
+	spin_unlock(&inode_hash_lock);
 
 	return 1;
 }
@@ -1017,7 +1122,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
 	static unsigned int counter;
 	ino_t res;
 
-	spin_lock(&inode_lock);
 	spin_lock(&iunique_lock);
 	do {
 		if (counter <= max_reserved)
@@ -1025,7 +1129,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
 		res = counter++;
 	} while (!test_inode_iunique(sb, res));
 	spin_unlock(&iunique_lock);
-	spin_unlock(&inode_lock);
 
 	return res;
 }
@@ -1033,89 +1136,23 @@ EXPORT_SYMBOL(iunique);
 
 struct inode *igrab(struct inode *inode)
 {
-	spin_lock(&inode_lock);
-	if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
+	spin_lock(&inode->i_lock);
+	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
 		__iget(inode);
-	else
+		spin_unlock(&inode->i_lock);
+	} else {
+		spin_unlock(&inode->i_lock);
 		/*
 		 * Handle the case where s_op->clear_inode has not been
 		 * called yet, and somebody is calling igrab
 		 * while the inode is getting freed.
 		 */
 		inode = NULL;
-	spin_unlock(&inode_lock);
+	}
 	return inode;
 }
 EXPORT_SYMBOL(igrab);
 
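[Editor's note: igrab() now makes its freeing-vs-live decision under inode->i_lock alone, and callers must still treat NULL as "inode already on its way out". A hedged usage sketch (function name and -ESTALE choice are illustrative):

static int myfs_pin_inode(struct dentry *dentry, struct inode **pinned)
{
	struct inode *inode = igrab(dentry->d_inode);

	if (!inode)
		return -ESTALE;		/* already I_FREEING/I_WILL_FREE */
	*pinned = inode;		/* caller must iput() this later */
	return 0;
}
]
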
-/**
- * ifind - internal function, you want ilookup5() or iget5().
- * @sb:		super block of file system to search
- * @head:       the head of the list to search
- * @test:	callback used for comparisons between inodes
- * @data:	opaque data pointer to pass to @test
- * @wait:	if true wait for the inode to be unlocked, if false do not
- *
- * ifind() searches for the inode specified by @data in the inode
- * cache. This is a generalized version of ifind_fast() for file systems where
- * the inode number is not sufficient for unique identification of an inode.
- *
- * If the inode is in the cache, the inode is returned with an incremented
- * reference count.
- *
- * Otherwise NULL is returned.
- *
- * Note, @test is called with the inode_lock held, so can't sleep.
- */
-static struct inode *ifind(struct super_block *sb,
-		struct hlist_head *head, int (*test)(struct inode *, void *),
-		void *data, const int wait)
-{
-	struct inode *inode;
-
-	spin_lock(&inode_lock);
-	inode = find_inode(sb, head, test, data);
-	if (inode) {
-		spin_unlock(&inode_lock);
-		if (likely(wait))
-			wait_on_inode(inode);
-		return inode;
-	}
-	spin_unlock(&inode_lock);
-	return NULL;
-}
-
-/**
- * ifind_fast - internal function, you want ilookup() or iget().
- * @sb:		super block of file system to search
- * @head:       head of the list to search
- * @ino:	inode number to search for
- *
- * ifind_fast() searches for the inode @ino in the inode cache. This is for
- * file systems where the inode number is sufficient for unique identification
- * of an inode.
- *
- * If the inode is in the cache, the inode is returned with an incremented
- * reference count.
- *
- * Otherwise NULL is returned.
- */
-static struct inode *ifind_fast(struct super_block *sb,
-		struct hlist_head *head, unsigned long ino)
-{
-	struct inode *inode;
-
-	spin_lock(&inode_lock);
-	inode = find_inode_fast(sb, head, ino);
-	if (inode) {
-		spin_unlock(&inode_lock);
-		wait_on_inode(inode);
-		return inode;
-	}
-	spin_unlock(&inode_lock);
-	return NULL;
-}
-
 /**
  * ilookup5_nowait - search for an inode in the inode cache
  * @sb:		super block of file system to search
@@ -1123,26 +1160,26 @@ static struct inode *ifind_fast(struct super_block *sb,
  * @test:	callback used for comparisons between inodes
  * @data:	opaque data pointer to pass to @test
  *
- * ilookup5() uses ifind() to search for the inode specified by @hashval and
- * @data in the inode cache. This is a generalized version of ilookup() for
- * file systems where the inode number is not sufficient for unique
- * identification of an inode.
- *
+ * Search for the inode specified by @hashval and @data in the inode cache.
  * If the inode is in the cache, the inode is returned with an incremented
- * reference count.  Note, the inode lock is not waited upon so you have to be
- * very careful what you do with the returned inode.  You probably should be
- * using ilookup5() instead.
+ * reference count.
 *
- * Otherwise NULL is returned.
+ * Note: I_NEW is not waited upon so you have to be very careful what you do
+ * with the returned inode.  You probably should be using ilookup5() instead.
 *
- * Note, @test is called with the inode_lock held, so can't sleep.
+ * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data)
 {
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *inode;
+
+	spin_lock(&inode_hash_lock);
+	inode = find_inode(sb, head, test, data);
+	spin_unlock(&inode_hash_lock);
 
-	return ifind(sb, head, test, data, 0);
+	return inode;
 }
 EXPORT_SYMBOL(ilookup5_nowait);
 
@@ -1153,24 +1190,24 @@ EXPORT_SYMBOL(ilookup5_nowait);
  * @test:	callback used for comparisons between inodes
  * @data:	opaque data pointer to pass to @test
  *
- * ilookup5() uses ifind() to search for the inode specified by @hashval and
- * @data in the inode cache. This is a generalized version of ilookup() for
- * file systems where the inode number is not sufficient for unique
- * identification of an inode.
- *
- * If the inode is in the cache, the inode lock is waited upon and the inode is
- * returned with an incremented reference count.
+ * Search for the inode specified by @hashval and @data in the inode cache,
+ * and if the inode is in the cache, return the inode with an incremented
+ * reference count.  Waits on I_NEW before returning the inode.
 *
- * Otherwise NULL is returned.
+ * This is a generalized version of ilookup() for file systems where the
+ * inode number is not sufficient for unique identification of an inode.
 *
- * Note, @test is called with the inode_lock held, so can't sleep.
+ * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
 struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data)
 {
-	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
 
-	return ifind(sb, head, test, data, 1);
+	if (inode)
+		wait_on_inode(inode);
+	return inode;
 }
 EXPORT_SYMBOL(ilookup5);
 
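[Editor's note: ilookup5() is the cache-only probe: it never allocates or reads disk, and may return NULL. Reusing the invented myfs_test()/myfs_key pair from the iget5_locked() sketch above:

struct inode *myfs_peek(struct super_block *sb, unsigned long objid)
{
	struct myfs_key key = { .objid = objid };
	struct inode *inode;

	inode = ilookup5(sb, objid, myfs_test, &key);
	if (!inode)
		return NULL;		/* not cached; no disk I/O is done */
	/* fully initialised here: ilookup5() waited on I_NEW */
	return inode;			/* caller drops the ref with iput() */
}
]
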
@@ -1179,91 +1216,23 @@ EXPORT_SYMBOL(ilookup5);
  * @sb:		super block of file system to search
  * @ino:	inode number to search for
  *
- * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
- * This is for file systems where the inode number is sufficient for unique
- * identification of an inode.
- *
- * If the inode is in the cache, the inode is returned with an incremented
- * reference count.
- *
- * Otherwise NULL is returned.
+ * Search for the inode @ino in the inode cache, and if the inode is in the
+ * cache, the inode is returned with an incremented reference count.
 */
 struct inode *ilookup(struct super_block *sb, unsigned long ino)
 {
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
-
-	return ifind_fast(sb, head, ino);
-}
-EXPORT_SYMBOL(ilookup);
-
-/**
- * iget5_locked - obtain an inode from a mounted file system
- * @sb:		super block of file system
- * @hashval:	hash value (usually inode number) to get
- * @test:	callback used for comparisons between inodes
- * @set:	callback used to initialize a new struct inode
- * @data:	opaque data pointer to pass to @test and @set
- *
- * iget5_locked() uses ifind() to search for the inode specified by @hashval
- * and @data in the inode cache and if present it is returned with an increased
- * reference count. This is a generalized version of iget_locked() for file
- * systems where the inode number is not sufficient for unique identification
- * of an inode.
- *
- * If the inode is not in cache, get_new_inode() is called to allocate a new
- * inode and this is returned locked, hashed, and with the I_NEW flag set. The
- * file system gets to fill it in before unlocking it via unlock_new_inode().
- *
- * Note both @test and @set are called with the inode_lock held, so can't sleep.
- */
-struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
-		int (*test)(struct inode *, void *),
-		int (*set)(struct inode *, void *), void *data)
-{
-	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 	struct inode *inode;
 
-	inode = ifind(sb, head, test, data, 1);
-	if (inode)
-		return inode;
-	/*
-	 * get_new_inode() will do the right thing, re-trying the search
-	 * in case it had to block at any point.
-	 */
-	return get_new_inode(sb, head, test, set, data);
-}
-EXPORT_SYMBOL(iget5_locked);
-
-/**
- * iget_locked - obtain an inode from a mounted file system
- * @sb:		super block of file system
- * @ino:	inode number to get
- *
- * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
- * the inode cache and if present it is returned with an increased reference
- * count. This is for file systems where the inode number is sufficient for
- * unique identification of an inode.
- *
- * If the inode is not in cache, get_new_inode_fast() is called to allocate a
- * new inode and this is returned locked, hashed, and with the I_NEW flag set.
- * The file system gets to fill it in before unlocking it via
- * unlock_new_inode().
- */
-struct inode *iget_locked(struct super_block *sb, unsigned long ino)
-{
-	struct hlist_head *head = inode_hashtable + hash(sb, ino);
-	struct inode *inode;
+	spin_lock(&inode_hash_lock);
+	inode = find_inode_fast(sb, head, ino);
+	spin_unlock(&inode_hash_lock);
 
-	inode = ifind_fast(sb, head, ino);
 	if (inode)
-		return inode;
-	/*
-	 * get_new_inode_fast() will do the right thing, re-trying the search
-	 * in case it had to block at any point.
-	 */
-	return get_new_inode_fast(sb, head, ino);
+		wait_on_inode(inode);
+	return inode;
 }
-EXPORT_SYMBOL(iget_locked);
+EXPORT_SYMBOL(ilookup);
 
 int insert_inode_locked(struct inode *inode)
 {
@@ -1271,27 +1240,33 @@ int insert_inode_locked(struct inode *inode)
 	ino_t ino = inode->i_ino;
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 
-	inode->i_state |= I_NEW;
 	while (1) {
 		struct hlist_node *node;
 		struct inode *old = NULL;
-		spin_lock(&inode_lock);
+		spin_lock(&inode_hash_lock);
 		hlist_for_each_entry(old, node, head, i_hash) {
 			if (old->i_ino != ino)
 				continue;
 			if (old->i_sb != sb)
 				continue;
-			if (old->i_state & (I_FREEING|I_WILL_FREE))
+			spin_lock(&old->i_lock);
+			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+				spin_unlock(&old->i_lock);
 				continue;
+			}
 			break;
 		}
 		if (likely(!node)) {
+			spin_lock(&inode->i_lock);
+			inode->i_state |= I_NEW;
 			hlist_add_head(&inode->i_hash, head);
-			spin_unlock(&inode_lock);
+			spin_unlock(&inode->i_lock);
+			spin_unlock(&inode_hash_lock);
 			return 0;
 		}
 		__iget(old);
-		spin_unlock(&inode_lock);
+		spin_unlock(&old->i_lock);
+		spin_unlock(&inode_hash_lock);
 		wait_on_inode(old);
 		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
@@ -1308,29 +1283,34 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 	struct super_block *sb = inode->i_sb;
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 
-	inode->i_state |= I_NEW;
-
 	while (1) {
 		struct hlist_node *node;
 		struct inode *old = NULL;
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_hash_lock);
 		hlist_for_each_entry(old, node, head, i_hash) {
 			if (old->i_sb != sb)
 				continue;
 			if (!test(old, data))
 				continue;
-			if (old->i_state & (I_FREEING|I_WILL_FREE))
+			spin_lock(&old->i_lock);
+			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+				spin_unlock(&old->i_lock);
 				continue;
+			}
 			break;
 		}
 		if (likely(!node)) {
+			spin_lock(&inode->i_lock);
+			inode->i_state |= I_NEW;
 			hlist_add_head(&inode->i_hash, head);
-			spin_unlock(&inode_lock);
+			spin_unlock(&inode->i_lock);
+			spin_unlock(&inode_hash_lock);
 			return 0;
 		}
 		__iget(old);
-		spin_unlock(&inode_lock);
+		spin_unlock(&old->i_lock);
+		spin_unlock(&inode_hash_lock);
 		wait_on_inode(old);
 		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
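
[Editor's note: insert_inode_locked() is the create-side counterpart of iget_locked(): the caller allocates and numbers the inode first, then races the hash insert against concurrent lookups; note how I_NEW is now set under inode->i_lock inside the hash lock rather than before the loop. A hedged create-path sketch (myfs_alloc_ino() is an invented allocator):

struct inode *myfs_new_inode(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	inode->i_ino = myfs_alloc_ino(sb);	/* invented allocator */

	if (insert_inode_locked(inode) < 0) {	/* lost the race: -EBUSY */
		iput(inode);
		return ERR_PTR(-EBUSY);
	}
	/* ... initialise the rest of the inode, write it out ... */
	unlock_new_inode(inode);
	return inode;
}
]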
@@ -1375,47 +1355,35 @@ static void iput_final(struct inode *inode)
 	const struct super_operations *op = inode->i_sb->s_op;
 	int drop;
 
+	WARN_ON(inode->i_state & I_NEW);
+
 	if (op && op->drop_inode)
 		drop = op->drop_inode(inode);
 	else
 		drop = generic_drop_inode(inode);
 
+	if (!drop && (sb->s_flags & MS_ACTIVE)) {
+		inode->i_state |= I_REFERENCED;
+		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
+			inode_lru_list_add(inode);
+		spin_unlock(&inode->i_lock);
+		return;
+	}
+
 	if (!drop) {
-		if (sb->s_flags & MS_ACTIVE) {
-			inode->i_state |= I_REFERENCED;
-			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
-				inode_lru_list_add(inode);
-			}
-			spin_unlock(&inode_lock);
-			return;
-		}
-		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_WILL_FREE;
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
 		write_inode_now(inode, 1);
-		spin_lock(&inode_lock);
+		spin_lock(&inode->i_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		__remove_inode_hash(inode);
 	}
 
-	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
-
-	/*
-	 * Move the inode off the IO lists and LRU once I_FREEING is
-	 * set so that it won't get moved back on there if it is dirty.
-	 */
 	inode_lru_list_del(inode);
-	list_del_init(&inode->i_wb_list);
+	spin_unlock(&inode->i_lock);
 
-	__inode_sb_list_del(inode);
-	spin_unlock(&inode_lock);
 	evict(inode);
-	remove_inode_hash(inode);
-	wake_up_inode(inode);
-	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
-	destroy_inode(inode);
 }
 
 /**
@@ -1432,7 +1400,7 @@ void iput(struct inode *inode)
 	if (inode) {
 		BUG_ON(inode->i_state & I_CLEAR);
 
-		if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+		if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
 			iput_final(inode);
 	}
 }
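
[Editor's note: atomic_dec_and_lock() is what keeps the common-path iput() off the per-inode lock entirely: only the thread that takes the count to zero acquires the lock, and it returns holding it, which is why iput_final() above is entered with inode->i_lock held. The same idiom in miniature, on an invented refcounted object:

struct obj {
	atomic_t count;
	spinlock_t lock;		/* serialises teardown checks */
};

static void obj_put(struct obj *obj)
{
	/*
	 * Uncontended decrements never touch obj->lock; only the thread
	 * that drops the last reference acquires it, and it returns with
	 * the lock held so final teardown is serialised against anyone
	 * still probing the object under obj->lock.
	 */
	if (atomic_dec_and_lock(&obj->count, &obj->lock)) {
		/* last reference gone; nobody else can find us now */
		spin_unlock(&obj->lock);
		kfree(obj);
	}
}
]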
@@ -1611,9 +1579,8 @@ EXPORT_SYMBOL(inode_wait);
  * to recheck inode state.
  *
  * It doesn't matter if I_NEW is not set initially, a call to
- * wake_up_inode() after removing from the hash list will DTRT.
- *
- * This is called with inode_lock held.
+ * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
+ * will DTRT.
 */
 static void __wait_on_freeing_inode(struct inode *inode)
 {
@@ -1621,10 +1588,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
 	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
 	wq = bit_waitqueue(&inode->i_state, __I_NEW);
 	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_hash_lock);
 	schedule();
 	finish_wait(wq, &wait.wait);
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
 }
 
 static __initdata unsigned long ihash_entries;

+ 7 - 0
fs/internal.h

@@ -125,6 +125,13 @@ extern long do_handle_open(int mountdirfd,
 /*
  * inode.c
  */
+extern spinlock_t inode_sb_list_lock;
+
+/*
+ * fs-writeback.c
+ */
+extern void inode_wb_list_del(struct inode *inode);
+
 extern int get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);

+ 1 - 1
fs/logfs/inode.c

@@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-/* called with inode_lock held */
+/* called with inode->i_lock held */
 static int logfs_drop_inode(struct inode *inode)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);

+ 18 - 5
fs/namei.c

@@ -992,6 +992,12 @@ int follow_down_one(struct path *path)
 	return 0;
 }
 
+static inline bool managed_dentry_might_block(struct dentry *dentry)
+{
+	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
+		dentry->d_op->d_manage(dentry, true) < 0);
+}
+
 /*
  * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
  * meet a managed dentry and we're not walking to "..".  True is returned to
@@ -1000,19 +1006,26 @@ int follow_down_one(struct path *path)
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 			       struct inode **inode, bool reverse_transit)
 {
-	while (d_mountpoint(path->dentry)) {
+	for (;;) {
 		struct vfsmount *mounted;
-		if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
-		    !reverse_transit &&
-		    path->dentry->d_op->d_manage(path->dentry, true) < 0)
+		/*
+		 * Don't forget we might have a non-mountpoint managed dentry
+		 * that wants to block transit.
+		 */
+		*inode = path->dentry->d_inode;
+		if (!reverse_transit &&
+		     unlikely(managed_dentry_might_block(path->dentry)))
 			return false;
+
+		if (!d_mountpoint(path->dentry))
+			break;
+
 		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
 		if (!mounted)
 			break;
 		path->mnt = mounted;
 		path->dentry = mounted->mnt_root;
 		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
-		*inode = path->dentry->d_inode;
 	}
 
 	if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))

+ 25 - 17
fs/notify/inode_mark.c

@@ -22,13 +22,14 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
+#include "../internal.h"
+
 /*
  * Recalculate the mask of events relevant to a given inode locked.
 */
@@ -237,15 +238,14 @@ out:
  * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
  * @list: list of inodes being unmounted (sb->s_inodes)
 *
- * Called with inode_lock held, protecting the unmounting super block's list
- * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
- * We temporarily drop inode_lock, however, and CAN block.
+ * Called during unmount with no locks held, so needs to be safe against
+ * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
 */
 void fsnotify_unmount_inodes(struct list_head *list)
 {
 	struct inode *inode, *next_i, *need_iput = NULL;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
 		struct inode *need_iput_tmp;
 
@@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		 * I_WILL_FREE, or I_NEW which is fine because by that point
 		 * the inode cannot have any associated watches.
 		 */
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 
 		/*
 		 * If i_count is zero, the inode cannot have any watches and
@@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		 * evict all inodes with zero i_count from icache which is
 		 * unnecessarily violent and may in fact be illegal to do.
 		 */
-		if (!atomic_read(&inode->i_count))
+		if (!atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 
 		need_iput_tmp = need_iput;
 		need_iput = NULL;
@@ -274,22 +279,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
 			__iget(inode);
 		else
 			need_iput_tmp = NULL;
+		spin_unlock(&inode->i_lock);
 
 		/* In case the dropping of a reference would nuke next_i. */
 		if ((&next_i->i_sb_list != list) &&
-		    atomic_read(&next_i->i_count) &&
-		    !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
-			__iget(next_i);
-			need_iput = next_i;
+		    atomic_read(&next_i->i_count)) {
+			spin_lock(&next_i->i_lock);
+			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+				__iget(next_i);
+				need_iput = next_i;
+			}
+			spin_unlock(&next_i->i_lock);
 		}
 
 		/*
-		 * We can safely drop inode_lock here because we hold
+		 * We can safely drop inode_sb_list_lock here because we hold
 		 * references on both inode and next_i.  Also no new inodes
-		 * will be added since the umount has begun.  Finally,
-		 * iprune_mutex keeps shrink_icache_memory() away.
+		 * will be added since the umount has begun.
 		 */
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_sb_list_lock);
 
 		if (need_iput_tmp)
 			iput(need_iput_tmp);
@@ -301,7 +309,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 
 		iput(inode);
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 }

+ 0 - 1
fs/notify/mark.c

@@ -91,7 +91,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/srcu.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 

+ 0 - 1
fs/notify/vfsmount_mark.c

@@ -23,7 +23,6 @@
 #include <linux/mount.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 

+ 2 - 2
fs/ntfs/inode.c

@@ -54,7 +54,7 @@
  *
  * Return 1 if the attributes match and 0 if not.
  *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
  * allowed to sleep.
 */
 int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
  *
  * Return 0 on success and -errno on error.
  *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
  * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
 */
 static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)

+ 23 - 18
fs/quota/dquot.c

@@ -76,7 +76,7 @@
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/quotaops.h>
-#include <linux/writeback.h> /* for inode_lock, oddly enough.. */
+#include "../internal.h" /* ugh */
 
 #include <asm/uaccess.h>
 
@@ -900,33 +900,38 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	int reserved = 0;
 #endif
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    !atomic_read(&inode->i_writecount) ||
+		    !dqinit_needed(inode, type)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 #ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
 #endif
-		if (!atomic_read(&inode->i_writecount))
-			continue;
-		if (!dqinit_needed(inode, type))
-			continue;
-
 		__iget(inode);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
 
 		iput(old_inode);
 		__dquot_initialize(inode, type);
-		/* We hold a reference to 'inode' so it couldn't have been
-		 * removed from s_inodes list while we dropped the inode_lock.
-		 * We cannot iput the inode now as we can be holding the last
-		 * reference and we cannot iput it under inode_lock. So we
-		 * keep the reference and iput it later. */
+
+		/*
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the
+		 * inode_sb_list_lock. We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * later.
+		 */
 		old_inode = inode;
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(old_inode);
 
 #ifdef CONFIG_QUOTA_DEBUG
@@ -1007,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 	struct inode *inode;
 	int reserved = 0;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		/*
 		 *  We have to scan also I_NEW inodes because they can already
@@ -1021,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 			remove_inode_dquot_ref(inode, type, tofree_head);
 		}
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened after quota"

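[Editor's note: add_dquot_ref() above is the canonical form of the series' list-walk idiom: pin the inode under its i_lock, drop the list lock for the blocking work, and defer each iput() to the next loop iteration so it never runs under a spinlock. Stripped to its skeleton, with the state checks elided and do_blocking_work() invented:

struct inode *inode, *old_inode = NULL;

spin_lock(&inode_sb_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
	spin_lock(&inode->i_lock);
	__iget(inode);			/* pin: i_count keeps it alive */
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_sb_list_lock);

	iput(old_inode);		/* safe: no spinlocks held here */
	do_blocking_work(inode);	/* placeholder for the real work */

	old_inode = inode;		/* defer this inode's iput() too */
	spin_lock(&inode_sb_list_lock);
}
spin_unlock(&inode_sb_list_lock);
iput(old_inode);

Holding the pinned reference is what makes it legal to resume the list walk after re-taking the lock.]
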
+ 1 - 1
include/linux/fs.h

@@ -1636,7 +1636,7 @@ struct super_operations {
 };
 
 /*
- * Inode state bits.  Protected by inode_lock.
+ * Inode state bits.  Protected by inode->i_lock.
 *
 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.

+ 1 - 1
include/linux/quotaops.h

@@ -277,7 +277,7 @@ static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
 		/*
 		 * Mark inode fully dirty. Since we are allocating blocks, inode
 		 * would become fully dirty soon anyway and it reportedly
-		 * reduces inode_lock contention.
+		 * reduces lock contention.
 		 */
 		mark_inode_dirty(inode);
 	}

+ 1 - 1
include/linux/writeback.h

@@ -9,7 +9,7 @@
 
 struct backing_dev_info;
 
-extern spinlock_t inode_lock;
+extern spinlock_t inode_wb_list_lock;
 
 /*
  * fs/fs-writeback.c

+ 4 - 4
mm/backing-dev.c

@@ -67,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	struct inode *inode;
 
 	nr_wb = nr_dirty = nr_io = nr_more_io = 0;
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
 	list_for_each_entry(inode, &wb->b_io, i_wb_list)
 		nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -676,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	if (bdi_has_dirty_io(bdi)) {
 		struct bdi_writeback *dst = &default_backing_dev_info.wb;
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
 		list_splice(&bdi->wb.b_io, &dst->b_io);
 		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 	}
 
 	bdi_unregister(bdi);

+ 6 - 4
mm/filemap.c

@@ -80,8 +80,8 @@
 *  ->i_mutex
 *    ->i_alloc_sem             (various)
 *
- *  ->inode_lock
- *    ->sb_lock			(fs/fs-writeback.c)
+ *  inode_wb_list_lock
+ *    sb_lock			(fs/fs-writeback.c)
 *    ->mapping->tree_lock	(__sync_single_inode)
 *
 *  ->i_mmap_lock
@@ -98,8 +98,10 @@
 *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->tree_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(zap_pte_range->set_page_dirty)
+ *    inode_wb_list_lock	(page_remove_rmap->set_page_dirty)
+ *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
+ *    inode_wb_list_lock	(zap_pte_range->set_page_dirty)
+ *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
 *  (code doesn't rely on that order, so you could switch it around)

+ 3 - 2
mm/rmap.c

@@ -31,11 +31,12 @@
 *             swap_lock (in swap_duplicate, swap_info_get)
 *               mmlist_lock (in mmput, drain_mmlist and others)
 *               mapping->private_lock (in __set_page_dirty_buffers)
- *               inode_lock (in set_page_dirty's __mark_inode_dirty)
+ *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ *               inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 *                 mapping->tree_lock (widely used, in set_page_dirty,
 *                           in arch-dependent flush_dcache_mmap_lock,
- *                           within inode_lock in __sync_single_inode)
+ *                           within inode_wb_list_lock in __sync_single_inode)
 *
 * (code doesn't rely on that order so it could be switched around)
 * ->tasklist_lock
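
[Editor's note: pulling the comment updates in this merge together, the lock hierarchy that replaces the single inode_lock comes out roughly as below. This is a reader's summary inferred from the hunks above, not text from the patches:

/*
 * inode->i_state and the final i_count decision: inode->i_lock.
 *
 *   inode_sb_list_lock  -> inode->i_lock      (s_inodes walks)
 *   inode_hash_lock     -> inode->i_lock      (find_inode, find_inode_fast)
 *   inode->i_lock       -> inode_wb_list_lock (__mark_inode_dirty)
 *   inode->i_lock       -> inode_lru_lock     (LRU add/remove)
 *
 * prune_icache() walks the LRU under inode_lru_lock and so meets
 * inode->i_lock in the inverted order; it uses spin_trylock() there.
 */
]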