9 лет назад · faeb20ecfa
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -61,6 +61,8 @@ struct ext2_block_alloc_info {
 
				 #define rsv_start rsv_window._rsv_start
			
 
				 #define rsv_end rsv_window._rsv_end
			
 
				 
			
 
				+struct mb_cache;
			
 
				+
			
 
				 /*
			
 
				  * second extended-fs super-block data in memory
			
 
				  */
			
@@ -111,6 +113,7 @@ struct ext2_sb_info {
 
				 	 * of the mount options.
			
 
				 	 */
			
 
				 	spinlock_t s_lock;
			
 
				+	struct mb_cache *s_mb_cache;
			
 
				 };
			
 
				 
			
 
				 static inline spinlock_t *
			
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb)
 
				 
			
 
				 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
			
 
				 
			
 
				-	ext2_xattr_put_super(sb);
			
 
				+	if (sbi->s_mb_cache) {
			
 
				+		ext2_xattr_destroy_cache(sbi->s_mb_cache);
			
 
				+		sbi->s_mb_cache = NULL;
			
 
				+	}
			
 
				 	if (!(sb->s_flags & MS_RDONLY)) {
			
 
				 		struct ext2_super_block *es = sbi->s_es;
			
 
				 
			
@@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
				 		ext2_msg(sb, KERN_ERR, "error: insufficient memory");
			
 
				 		goto failed_mount3;
			
 
				 	}
			
 
				+
			
 
				+#ifdef CONFIG_EXT2_FS_XATTR
			
 
				+	sbi->s_mb_cache = ext2_xattr_create_cache();
			
 
				+	if (!sbi->s_mb_cache) {
			
 
				+		ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache");
			
 
				+		goto failed_mount3;
			
 
				+	}
			
 
				+#endif
			
 
				 	/*
			
 
				 	 * set up enough so that it can read an inode
			
 
				 	 */
			
@@ -1149,6 +1160,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
				 			sb->s_id);
			
 
				 	goto failed_mount;
			
 
				 failed_mount3:
			
 
				+	if (sbi->s_mb_cache)
			
 
				+		ext2_xattr_destroy_cache(sbi->s_mb_cache);
			
 
				 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
			
 
				 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
			
 
				 	percpu_counter_destroy(&sbi->s_dirs_counter);
			
@@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2");
 
				 
			
 
				 static int __init init_ext2_fs(void)
			
 
				 {
			
 
				-	int err = init_ext2_xattr();
			
 
				-	if (err)
			
 
				-		return err;
			
 
				+	int err;
			
 
				+
			
 
				 	err = init_inodecache();
			
 
				 	if (err)
			
 
				-		goto out1;
			
 
				+		return err;
			
 
				         err = register_filesystem(&ext2_fs_type);
			
 
				 	if (err)
			
 
				 		goto out;
			
 
				 	return 0;
			
 
				 out:
			
 
				 	destroy_inodecache();
			
 
				-out1:
			
 
				-	exit_ext2_xattr();
			
 
				 	return err;
			
 
				 }
			
 
				 
			
@@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void)
 
				 {
			
 
				 	unregister_filesystem(&ext2_fs_type);
			
 
				 	destroy_inodecache();
			
 
				-	exit_ext2_xattr();
			
 
				 }
			
 
				 
			
 
				 MODULE_AUTHOR("Remy Card and others");
			
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -90,14 +90,12 @@
 
				 static int ext2_xattr_set2(struct inode *, struct buffer_head *,
			
 
				 			   struct ext2_xattr_header *);
			
 
				 
			
 
				-static int ext2_xattr_cache_insert(struct buffer_head *);
			
 
				+static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
			
 
				 static struct buffer_head *ext2_xattr_cache_find(struct inode *,
			
 
				 						 struct ext2_xattr_header *);
			
 
				 static void ext2_xattr_rehash(struct ext2_xattr_header *,
			
 
				 			      struct ext2_xattr_entry *);
			
 
				 
			
 
				-static struct mb_cache *ext2_xattr_cache;
			
 
				-
			
 
				 static const struct xattr_handler *ext2_xattr_handler_map[] = {
			
 
				 	[EXT2_XATTR_INDEX_USER]		     = &ext2_xattr_user_handler,
			
 
				 #ifdef CONFIG_EXT2_FS_POSIX_ACL
			
@@ -152,6 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
 
				 	size_t name_len, size;
			
 
				 	char *end;
			
 
				 	int error;
			
 
				+	struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
			
 
				 
			
 
				 	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
			
 
				 		  name_index, name, buffer, (long)buffer_size);
			
@@ -196,7 +195,7 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_get",
 
				 			goto found;
			
 
				 		entry = next;
			
 
				 	}
			
 
				-	if (ext2_xattr_cache_insert(bh))
			
 
				+	if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
			
 
				 		ea_idebug(inode, "cache insert failed");
			
 
				 	error = -ENODATA;
			
 
				 	goto cleanup;
			
@@ -209,7 +208,7 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_get",
 
				 	    le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
			
 
				 		goto bad_block;
			
 
				 
			
 
				-	if (ext2_xattr_cache_insert(bh))
			
 
				+	if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
			
 
				 		ea_idebug(inode, "cache insert failed");
			
 
				 	if (buffer) {
			
 
				 		error = -ERANGE;
			
@@ -247,6 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 
				 	char *end;
			
 
				 	size_t rest = buffer_size;
			
 
				 	int error;
			
 
				+	struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
			
 
				 
			
 
				 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
			
 
				 		  buffer, (long)buffer_size);
			
@@ -281,7 +281,7 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
 
				 			goto bad_block;
			
 
				 		entry = next;
			
 
				 	}
			
 
				-	if (ext2_xattr_cache_insert(bh))
			
 
				+	if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
			
 
				 		ea_idebug(inode, "cache insert failed");
			
 
				 
			
 
				 	/* list the attribute names */
			
@@ -483,22 +483,23 @@ bad_block:		ext2_error(sb, "ext2_xattr_set",
 
				 	/* Here we know that we can set the new attribute. */
			
 
				 
			
 
				 	if (header) {
			
 
				-		struct mb_cache_entry *ce;
			
 
				-
			
 
				 		/* assert(header == HDR(bh)); */
			
 
				-		ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev,
			
 
				-					bh->b_blocknr);
			
 
				 		lock_buffer(bh);
			
 
				 		if (header->h_refcount == cpu_to_le32(1)) {
			
 
				+			__u32 hash = le32_to_cpu(header->h_hash);
			
 
				+
			
 
				 			ea_bdebug(bh, "modifying in-place");
			
 
				-			if (ce)
			
 
				-				mb_cache_entry_free(ce);
			
 
				+			/*
			
 
				+			 * This must happen under buffer lock for
			
 
				+			 * ext2_xattr_set2() to reliably detect modified block
			
 
				+			 */
			
 
				+			mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
			
 
				+						    hash, bh->b_blocknr);
			
 
				+
			
 
				 			/* keep the buffer locked while modifying it. */
			
 
				 		} else {
			
 
				 			int offset;
			
 
				 
			
 
				-			if (ce)
			
 
				-				mb_cache_entry_release(ce);
			
 
				 			unlock_buffer(bh);
			
 
				 			ea_bdebug(bh, "cloning");
			
 
				 			header = kmalloc(bh->b_size, GFP_KERNEL);
			
@@ -626,6 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 
				 	struct super_block *sb = inode->i_sb;
			
 
				 	struct buffer_head *new_bh = NULL;
			
 
				 	int error;
			
 
				+	struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache;
			
 
				 
			
 
				 	if (header) {
			
 
				 		new_bh = ext2_xattr_cache_find(inode, header);
			
@@ -653,7 +655,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 
				 			   don't need to change the reference count. */
			
 
				 			new_bh = old_bh;
			
 
				 			get_bh(new_bh);
			
 
				-			ext2_xattr_cache_insert(new_bh);
			
 
				+			ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
			
 
				 		} else {
			
 
				 			/* We need to allocate a new block */
			
 
				 			ext2_fsblk_t goal = ext2_group_first_block_no(sb,
			
@@ -674,7 +676,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 
				 			memcpy(new_bh->b_data, header, new_bh->b_size);
			
 
				 			set_buffer_uptodate(new_bh);
			
 
				 			unlock_buffer(new_bh);
			
 
				-			ext2_xattr_cache_insert(new_bh);
			
 
				+			ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
			
 
				 			
			
 
				 			ext2_xattr_update_super_block(sb);
			
 
				 		}
			
@@ -707,19 +709,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 
				 
			
 
				 	error = 0;
			
 
				 	if (old_bh && old_bh != new_bh) {
			
 
				-		struct mb_cache_entry *ce;
			
 
				-
			
 
				 		/*
			
 
				 		 * If there was an old block and we are no longer using it,
			
 
				 		 * release the old block.
			
 
				 		 */
			
 
				-		ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev,
			
 
				-					old_bh->b_blocknr);
			
 
				 		lock_buffer(old_bh);
			
 
				 		if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
			
 
				+			__u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
			
 
				+
			
 
				+			/*
			
 
				+			 * This must happen under buffer lock for
			
 
				+			 * ext2_xattr_set2() to reliably detect freed block
			
 
				+			 */
			
 
				+			mb_cache_entry_delete_block(ext2_mb_cache,
			
 
				+						    hash, old_bh->b_blocknr);
			
 
				 			/* Free the old block. */
			
 
				-			if (ce)
			
 
				-				mb_cache_entry_free(ce);
			
 
				 			ea_bdebug(old_bh, "freeing");
			
 
				 			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
			
 
				 			mark_inode_dirty(inode);
			
@@ -730,8 +734,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 
				 		} else {
			
 
				 			/* Decrement the refcount only. */
			
 
				 			le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
			
 
				-			if (ce)
			
 
				-				mb_cache_entry_release(ce);
			
 
				 			dquot_free_block_nodirty(inode, 1);
			
 
				 			mark_inode_dirty(inode);
			
 
				 			mark_buffer_dirty(old_bh);
			
@@ -757,7 +759,6 @@ void
 
				 ext2_xattr_delete_inode(struct inode *inode)
			
 
				 {
			
 
				 	struct buffer_head *bh = NULL;
			
 
				-	struct mb_cache_entry *ce;
			
 
				 
			
 
				 	down_write(&EXT2_I(inode)->xattr_sem);
			
 
				 	if (!EXT2_I(inode)->i_file_acl)
			
@@ -777,19 +778,22 @@ ext2_xattr_delete_inode(struct inode *inode)
 
				 			EXT2_I(inode)->i_file_acl);
			
 
				 		goto cleanup;
			
 
				 	}
			
 
				-	ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
			
 
				 	lock_buffer(bh);
			
 
				 	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
			
 
				-		if (ce)
			
 
				-			mb_cache_entry_free(ce);
			
 
				+		__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
			
 
				+
			
 
				+		/*
			
 
				+		 * This must happen under buffer lock for ext2_xattr_set2() to
			
 
				+		 * reliably detect freed block
			
 
				+		 */
			
 
				+		mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
			
 
				+					    hash, bh->b_blocknr);
			
 
				 		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
			
 
				 		get_bh(bh);
			
 
				 		bforget(bh);
			
 
				 		unlock_buffer(bh);
			
 
				 	} else {
			
 
				 		le32_add_cpu(&HDR(bh)->h_refcount, -1);
			
 
				-		if (ce)
			
 
				-			mb_cache_entry_release(ce);
			
 
				 		ea_bdebug(bh, "refcount now=%d",
			
 
				 			le32_to_cpu(HDR(bh)->h_refcount));
			
 
				 		unlock_buffer(bh);
			
@@ -805,18 +809,6 @@ ext2_xattr_delete_inode(struct inode *inode)
 
				 	up_write(&EXT2_I(inode)->xattr_sem);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * ext2_xattr_put_super()
			
 
				- *
			
 
				- * This is called when a file system is unmounted.
			
 
				- */
			
 
				-void
			
 
				-ext2_xattr_put_super(struct super_block *sb)
			
 
				-{
			
 
				-	mb_cache_shrink(sb->s_bdev);
			
 
				-}
			
 
				-
			
 
				-
			
 
				 /*
			
 
				  * ext2_xattr_cache_insert()
			
 
				  *
			
@@ -826,28 +818,19 @@ ext2_xattr_put_super(struct super_block *sb)
 
				  * Returns 0, or a negative error number on failure.
			
 
				  */
			
 
				 static int
			
 
				-ext2_xattr_cache_insert(struct buffer_head *bh)
			
 
				+ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
			
 
				 {
			
 
				 	__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
			
 
				-	struct mb_cache_entry *ce;
			
 
				 	int error;
			
 
				 
			
 
				-	ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
			
 
				-	if (!ce)
			
 
				-		return -ENOMEM;
			
 
				-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
			
 
				+	error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
			
 
				 	if (error) {
			
 
				-		mb_cache_entry_free(ce);
			
 
				 		if (error == -EBUSY) {
			
 
				-			ea_bdebug(bh, "already in cache (%d cache entries)",
			
 
				-				atomic_read(&ext2_xattr_cache->c_entry_count));
			
 
				+			ea_bdebug(bh, "already in cache");
			
 
				 			error = 0;
			
 
				 		}
			
 
				-	} else {
			
 
				-		ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
			
 
				-			  atomic_read(&ext2_xattr_cache->c_entry_count));
			
 
				-		mb_cache_entry_release(ce);
			
 
				-	}
			
 
				+	} else
			
 
				+		ea_bdebug(bh, "inserting [%x]", (int)hash);
			
 
				 	return error;
			
 
				 }
			
 
				 
			
@@ -904,22 +888,16 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 
				 {
			
 
				 	__u32 hash = le32_to_cpu(header->h_hash);
			
 
				 	struct mb_cache_entry *ce;
			
 
				+	struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
			
 
				 
			
 
				 	if (!header->h_hash)
			
 
				 		return NULL;  /* never share */
			
 
				 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
			
 
				 again:
			
 
				-	ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
			
 
				-				       hash);
			
 
				+	ce = mb_cache_entry_find_first(ext2_mb_cache, hash);
			
 
				 	while (ce) {
			
 
				 		struct buffer_head *bh;
			
 
				 
			
 
				-		if (IS_ERR(ce)) {
			
 
				-			if (PTR_ERR(ce) == -EAGAIN)
			
 
				-				goto again;
			
 
				-			break;
			
 
				-		}
			
 
				-
			
 
				 		bh = sb_bread(inode->i_sb, ce->e_block);
			
 
				 		if (!bh) {
			
 
				 			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
			
@@ -927,7 +905,21 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 
				 				inode->i_ino, (unsigned long) ce->e_block);
			
 
				 		} else {
			
 
				 			lock_buffer(bh);
			
 
				-			if (le32_to_cpu(HDR(bh)->h_refcount) >
			
 
				+			/*
			
 
				+			 * We have to be careful about races with freeing or
			
 
				+			 * rehashing of xattr block. Once we hold buffer lock
			
 
				+			 * xattr block's state is stable so we can check
			
 
				+			 * whether the block got freed / rehashed or not.
			
 
				+			 * Since we unhash mbcache entry under buffer lock when
			
 
				+			 * freeing / rehashing xattr block, checking whether
			
 
				+			 * entry is still hashed is reliable.
			
 
				+			 */
			
 
				+			if (hlist_bl_unhashed(&ce->e_hash_list)) {
			
 
				+				mb_cache_entry_put(ext2_mb_cache, ce);
			
 
				+				unlock_buffer(bh);
			
 
				+				brelse(bh);
			
 
				+				goto again;
			
 
				+			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
			
 
				 				   EXT2_XATTR_REFCOUNT_MAX) {
			
 
				 				ea_idebug(inode, "block %ld refcount %d>%d",
			
 
				 					  (unsigned long) ce->e_block,
			
@@ -936,13 +928,14 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 
				 			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
			
 
				 				ea_bdebug(bh, "b_count=%d",
			
 
				 					  atomic_read(&(bh->b_count)));
			
 
				-				mb_cache_entry_release(ce);
			
 
				+				mb_cache_entry_touch(ext2_mb_cache, ce);
			
 
				+				mb_cache_entry_put(ext2_mb_cache, ce);
			
 
				 				return bh;
			
 
				 			}
			
 
				 			unlock_buffer(bh);
			
 
				 			brelse(bh);
			
 
				 		}
			
 
				-		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
			
 
				+		ce = mb_cache_entry_find_next(ext2_mb_cache, ce);
			
 
				 	}
			
 
				 	return NULL;
			
 
				 }
			
@@ -1015,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
 
				 
			
 
				 #undef BLOCK_HASH_SHIFT
			
 
				 
			
 
				-int __init
			
 
				-init_ext2_xattr(void)
			
 
				+#define HASH_BUCKET_BITS 10
			
 
				+
			
 
				+struct mb_cache *ext2_xattr_create_cache(void)
			
 
				 {
			
 
				-	ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
			
 
				-	if (!ext2_xattr_cache)
			
 
				-		return -ENOMEM;
			
 
				-	return 0;
			
 
				+	return mb_cache_create(HASH_BUCKET_BITS);
			
 
				 }
			
 
				 
			
 
				-void
			
 
				-exit_ext2_xattr(void)
			
 
				+void ext2_xattr_destroy_cache(struct mb_cache *cache)
			
 
				 {
			
 
				-	mb_cache_destroy(ext2_xattr_cache);
			
 
				+	if (cache)
			
 
				+		mb_cache_destroy(cache);
			
 
				 }
			
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -53,6 +53,8 @@ struct ext2_xattr_entry {
 
				 #define EXT2_XATTR_SIZE(size) \
			
 
				 	(((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
			
 
				 
			
 
				+struct mb_cache;
			
 
				+
			
 
				 # ifdef CONFIG_EXT2_FS_XATTR
			
 
				 
			
 
				 extern const struct xattr_handler ext2_xattr_user_handler;
			
@@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
 
				 extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
			
 
				 
			
 
				 extern void ext2_xattr_delete_inode(struct inode *);
			
 
				-extern void ext2_xattr_put_super(struct super_block *);
			
 
				 
			
 
				-extern int init_ext2_xattr(void);
			
 
				-extern void exit_ext2_xattr(void);
			
 
				+extern struct mb_cache *ext2_xattr_create_cache(void);
			
 
				+extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
			
 
				 
			
 
				 extern const struct xattr_handler *ext2_xattr_handlers[];
			
 
				 
			
@@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 
				 {
			
 
				 }
			
 
				 
			
 
				-static inline void
			
 
				-ext2_xattr_put_super(struct super_block *sb)
			
 
				-{
			
 
				-}
			
 
				-
			
 
				-static inline int
			
 
				-init_ext2_xattr(void)
			
 
				-{
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static inline void
			
 
				-exit_ext2_xattr(void)
			
 
				+static inline void ext2_xattr_destroy_cache(struct mb_cache *cache)
			
 
				 {
			
 
				 }
			
 
				 
			
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -41,6 +41,18 @@
 
				  * The fourth extended filesystem constants/structures
			
 
				  */
			
 
				 
			
 
				+/*
			
 
				+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
			
 
				+ * structures. these checks slow things down a lot
			
 
				+ */
			
 
				+#define AGGRESSIVE_CHECK__
			
 
				+
			
 
				+/*
			
 
				+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
			
 
				+ * bitmaps, maintains and uses them to check for double allocations
			
 
				+ */
			
 
				+#define DOUBLE_CHECK__
			
 
				+
			
 
				 /*
			
 
				  * Define EXT4FS_DEBUG to produce debug messages
			
 
				  */
			
@@ -182,9 +194,9 @@ typedef struct ext4_io_end {
 
				 	struct bio		*bio;		/* Linked list of completed
			
 
				 						 * bios covering the extent */
			
 
				 	unsigned int		flag;		/* unwritten or not */
			
 
				+	atomic_t		count;		/* reference counter */
			
 
				 	loff_t			offset;		/* offset in the file */
			
 
				 	ssize_t			size;		/* size of the extent */
			
 
				-	atomic_t		count;		/* reference counter */
			
 
				 } ext4_io_end_t;
			
 
				 
			
 
				 struct ext4_io_submit {
			
@@ -1024,13 +1036,8 @@ struct ext4_inode_info {
 
				 	 * transaction reserved
			
 
				 	 */
			
 
				 	struct list_head i_rsv_conversion_list;
			
 
				-	/*
			
 
				-	 * Completed IOs that need unwritten extents handling and don't have
			
 
				-	 * transaction reserved
			
 
				-	 */
			
 
				-	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
			
 
				-	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
			
 
				 	struct work_struct i_rsv_conversion_work;
			
 
				+	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
			
 
				 
			
 
				 	spinlock_t i_block_reservation_lock;
			
 
				 
			
@@ -1513,16 +1520,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
			
 
				-{
			
 
				-	return inode->i_private;
			
 
				-}
			
 
				-
			
 
				-static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
			
 
				-{
			
 
				-	inode->i_private = io;
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * Inode dynamic state flags
			
 
				  */
			
@@ -2506,12 +2503,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
				 int ext4_inode_is_fast_symlink(struct inode *inode);
			
 
				 struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
			
 
				 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
			
 
				-int ext4_get_block_write(struct inode *inode, sector_t iblock,
			
 
				-			 struct buffer_head *bh_result, int create);
			
 
				+int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
			
 
				+			     struct buffer_head *bh_result, int create);
			
 
				 int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
			
 
				 			    struct buffer_head *bh_result, int create);
			
 
				 int ext4_get_block(struct inode *inode, sector_t iblock,
			
 
				-				struct buffer_head *bh_result, int create);
			
 
				+		   struct buffer_head *bh_result, int create);
			
 
				+int ext4_dio_get_block(struct inode *inode, sector_t iblock,
			
 
				+		       struct buffer_head *bh_result, int create);
			
 
				 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
			
 
				 			   struct buffer_head *bh, int create);
			
 
				 int ext4_walk_page_buffers(handle_t *handle,
			
@@ -2559,6 +2558,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
 
				 					int used, int quota_claim);
			
 
				 extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
			
 
				 			      ext4_fsblk_t pblk, ext4_lblk_t len);
			
 
				+extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
			
 
				+				unsigned int map_len,
			
 
				+				struct extent_status *result);
			
 
				 
			
 
				 /* indirect.c */
			
 
				 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
			
@@ -3285,10 +3287,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
 
				 #define EXT4_WQ_HASH_SZ		37
			
 
				 #define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
			
 
				 					    EXT4_WQ_HASH_SZ])
			
 
				-#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
			
 
				-					     EXT4_WQ_HASH_SZ])
			
 
				 extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
			
 
				-extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
			
 
				 
			
 
				 #define EXT4_RESIZING	0
			
 
				 extern int ext4_resize_begin(struct super_block *sb);
			
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -11,7 +11,7 @@
 
				  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				  * GNU General Public License for more details.
			
 
				  *
			
 
				- * You should have received a copy of the GNU General Public Licens
			
 
				+ * You should have received a copy of the GNU General Public License
			
 
				  * along with this program; if not, write to the Free Software
			
 
				  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
			
 
				  */
			
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -15,7 +15,7 @@
 
				  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				  * GNU General Public License for more details.
			
 
				  *
			
 
				- * You should have received a copy of the GNU General Public Licens
			
 
				+ * You should have received a copy of the GNU General Public License
			
 
				  * along with this program; if not, write to the Free Software
			
 
				  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
			
 
				  */
			
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 
				 	 */
			
 
				 	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
			
 
				 		return 0;
			
 
				+	/*
			
 
				+	 * The check for IO to unwritten extent is somewhat racy as we
			
 
				+	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
			
 
				+	 * dropping i_data_sem. But reserved blocks should save us in that
			
 
				+	 * case.
			
 
				+	 */
			
 
				 	if (ext4_ext_is_unwritten(ex1) &&
			
 
				 	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
			
 
				 	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
			
@@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * ext4_ext_put_gap_in_cache:
			
 
				- * calculate boundaries of the gap that the requested block fits into
			
 
				- * and cache this gap
			
 
				+ * ext4_ext_determine_hole - determine hole around given block
			
 
				+ * @inode:	inode we lookup in
			
 
				+ * @path:	path in extent tree to @lblk
			
 
				+ * @lblk:	pointer to logical block around which we want to determine hole
			
 
				+ *
			
 
				+ * Determine hole length (and start if easily possible) around given logical
			
 
				+ * block. We don't try too hard to find the beginning of the hole but if @path
			
 
				+ * actually points to extent before @lblk, we provide it.
			
 
				+ *
			
 
				+ * The function returns the length of a hole starting at @lblk. We update @lblk
			
 
				+ * to the beginning of the hole if we managed to find it.
			
 
				  */
			
 
				-static void
			
 
				-ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
			
 
				-				ext4_lblk_t block)
			
 
				+static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
			
 
				+					   struct ext4_ext_path *path,
			
 
				+					   ext4_lblk_t *lblk)
			
 
				 {
			
 
				 	int depth = ext_depth(inode);
			
 
				-	ext4_lblk_t len;
			
 
				-	ext4_lblk_t lblock;
			
 
				 	struct ext4_extent *ex;
			
 
				-	struct extent_status es;
			
 
				+	ext4_lblk_t len;
			
 
				 
			
 
				 	ex = path[depth].p_ext;
			
 
				 	if (ex == NULL) {
			
 
				 		/* there is no extent yet, so gap is [0;-] */
			
 
				-		lblock = 0;
			
 
				+		*lblk = 0;
			
 
				 		len = EXT_MAX_BLOCKS;
			
 
				-		ext_debug("cache gap(whole file):");
			
 
				-	} else if (block < le32_to_cpu(ex->ee_block)) {
			
 
				-		lblock = block;
			
 
				-		len = le32_to_cpu(ex->ee_block) - block;
			
 
				-		ext_debug("cache gap(before): %u [%u:%u]",
			
 
				-				block,
			
 
				-				le32_to_cpu(ex->ee_block),
			
 
				-				 ext4_ext_get_actual_len(ex));
			
 
				-	} else if (block >= le32_to_cpu(ex->ee_block)
			
 
				+	} else if (*lblk < le32_to_cpu(ex->ee_block)) {
			
 
				+		len = le32_to_cpu(ex->ee_block) - *lblk;
			
 
				+	} else if (*lblk >= le32_to_cpu(ex->ee_block)
			
 
				 			+ ext4_ext_get_actual_len(ex)) {
			
 
				 		ext4_lblk_t next;
			
 
				-		lblock = le32_to_cpu(ex->ee_block)
			
 
				-			+ ext4_ext_get_actual_len(ex);
			
 
				 
			
 
				+		*lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
			
 
				 		next = ext4_ext_next_allocated_block(path);
			
 
				-		ext_debug("cache gap(after): [%u:%u] %u",
			
 
				-				le32_to_cpu(ex->ee_block),
			
 
				-				ext4_ext_get_actual_len(ex),
			
 
				-				block);
			
 
				-		BUG_ON(next == lblock);
			
 
				-		len = next - lblock;
			
 
				+		BUG_ON(next == *lblk);
			
 
				+		len = next - *lblk;
			
 
				 	} else {
			
 
				 		BUG();
			
 
				 	}
			
 
				+	return len;
			
 
				+}
			
 
				 
			
 
				-	ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
			
 
				+/*
			
 
				+ * ext4_ext_put_gap_in_cache:
			
 
				+ * calculate boundaries of the gap that the requested block fits into
			
 
				+ * and cache this gap
			
 
				+ */
			
 
				+static void
			
 
				+ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
			
 
				+			  ext4_lblk_t hole_len)
			
 
				+{
			
 
				+	struct extent_status es;
			
 
				+
			
 
				+	ext4_es_find_delayed_extent_range(inode, hole_start,
			
 
				+					  hole_start + hole_len - 1, &es);
			
 
				 	if (es.es_len) {
			
 
				 		/* There's delayed extent containing lblock? */
			
 
				-		if (es.es_lblk <= lblock)
			
 
				+		if (es.es_lblk <= hole_start)
			
 
				 			return;
			
 
				-		len = min(es.es_lblk - lblock, len);
			
 
				+		hole_len = min(es.es_lblk - hole_start, hole_len);
			
 
				 	}
			
 
				-	ext_debug(" -> %u:%u\n", lblock, len);
			
 
				-	ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
			
 
				+	ext_debug(" -> %u:%u\n", hole_start, hole_len);
			
 
				+	ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
			
 
				+			      EXTENT_STATUS_HOLE);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -3927,7 +3943,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 
				 static int
			
 
				 convert_initialized_extent(handle_t *handle, struct inode *inode,
			
 
				 			   struct ext4_map_blocks *map,
			
 
				-			   struct ext4_ext_path **ppath, int flags,
			
 
				+			   struct ext4_ext_path **ppath,
			
 
				 			   unsigned int allocated)
			
 
				 {
			
 
				 	struct ext4_ext_path *path = *ppath;
			
@@ -4007,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 
				 	struct ext4_ext_path *path = *ppath;
			
 
				 	int ret = 0;
			
 
				 	int err = 0;
			
 
				-	ext4_io_end_t *io = ext4_inode_aio(inode);
			
 
				 
			
 
				 	ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
			
 
				 		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
			
@@ -4030,15 +4045,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 
				 					 flags | EXT4_GET_BLOCKS_CONVERT);
			
 
				 		if (ret <= 0)
			
 
				 			goto out;
			
 
				-		/*
			
 
				-		 * Flag the inode(non aio case) or end_io struct (aio case)
			
 
				-		 * that this IO needs to conversion to written when IO is
			
 
				-		 * completed
			
 
				-		 */
			
 
				-		if (io)
			
 
				-			ext4_set_io_unwritten_flag(inode, io);
			
 
				-		else
			
 
				-			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
			
 
				 		map->m_flags |= EXT4_MAP_UNWRITTEN;
			
 
				 		goto out;
			
 
				 	}
			
@@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
				 	unsigned int allocated = 0, offset = 0;
			
 
				 	unsigned int allocated_clusters = 0;
			
 
				 	struct ext4_allocation_request ar;
			
 
				-	ext4_io_end_t *io = ext4_inode_aio(inode);
			
 
				 	ext4_lblk_t cluster_offset;
			
 
				-	int set_unwritten = 0;
			
 
				 	bool map_from_cluster = false;
			
 
				 
			
 
				 	ext_debug("blocks %u/%u requested for inode %lu\n",
			
@@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
				 			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
			
 
				 				allocated = convert_initialized_extent(
			
 
				 						handle, inode, map, &path,
			
 
				-						flags, allocated);
			
 
				+						allocated);
			
 
				 				goto out2;
			
 
				 			} else if (!ext4_ext_is_unwritten(ex))
			
 
				 				goto out;
			
@@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
				 	 * we couldn't try to create block if create flag is zero
			
 
				 	 */
			
 
				 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
			
 
				+		ext4_lblk_t hole_start, hole_len;
			
 
				+
			
 
				+		hole_start = map->m_lblk;
			
 
				+		hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
			
 
				 		/*
			
 
				 		 * put just found gap into cache to speed up
			
 
				 		 * subsequent requests
			
 
				 		 */
			
 
				-		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
			
 
				+		ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
			
 
				+
			
 
				+		/* Update hole_len to reflect hole size after map->m_lblk */
			
 
				+		if (hole_start != map->m_lblk)
			
 
				+			hole_len -= map->m_lblk - hole_start;
			
 
				+		map->m_pblk = 0;
			
 
				+		map->m_len = min_t(unsigned int, map->m_len, hole_len);
			
 
				+
			
 
				 		goto out2;
			
 
				 	}
			
 
				 
			
@@ -4482,15 +4497,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
				 	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
			
 
				 		ext4_ext_mark_unwritten(&newex);
			
 
				 		map->m_flags |= EXT4_MAP_UNWRITTEN;
			
 
				-		/*
			
 
				-		 * io_end structure was created for every IO write to an
			
 
				-		 * unwritten extent. To avoid unnecessary conversion,
			
 
				-		 * here we flag the IO that really needs the conversion.
			
 
				-		 * For non asycn direct IO case, flag the inode state
			
 
				-		 * that we need to perform conversion when IO is done.
			
 
				-		 */
			
 
				-		if (flags & EXT4_GET_BLOCKS_PRE_IO)
			
 
				-			set_unwritten = 1;
			
 
				 	}
			
 
				 
			
 
				 	err = 0;
			
@@ -4501,14 +4507,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
				 		err = ext4_ext_insert_extent(handle, inode, &path,
			
 
				 					     &newex, flags);
			
 
				 
			
 
				-	if (!err && set_unwritten) {
			
 
				-		if (io)
			
 
				-			ext4_set_io_unwritten_flag(inode, io);
			
 
				-		else
			
 
				-			ext4_set_inode_state(inode,
			
 
				-					     EXT4_STATE_DIO_UNWRITTEN);
			
 
				-	}
			
 
				-
			
 
				 	if (err && free_on_err) {
			
 
				 		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
			
 
				 			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
			
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -823,8 +823,8 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 
				 		es->es_lblk = es1->es_lblk;
			
 
				 		es->es_len = es1->es_len;
			
 
				 		es->es_pblk = es1->es_pblk;
			
 
				-		if (!ext4_es_is_referenced(es))
			
 
				-			ext4_es_set_referenced(es);
			
 
				+		if (!ext4_es_is_referenced(es1))
			
 
				+			ext4_es_set_referenced(es1);
			
 
				 		stats->es_stats_cache_hits++;
			
 
				 	} else {
			
 
				 		stats->es_stats_cache_misses++;
			
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
				 {
			
 
				 	struct file *file = iocb->ki_filp;
			
 
				 	struct inode *inode = file_inode(iocb->ki_filp);
			
 
				-	struct mutex *aio_mutex = NULL;
			
 
				 	struct blk_plug plug;
			
 
				 	int o_direct = iocb->ki_flags & IOCB_DIRECT;
			
 
				+	int unaligned_aio = 0;
			
 
				 	int overwrite = 0;
			
 
				 	ssize_t ret;
			
 
				 
			
 
				+	inode_lock(inode);
			
 
				+	ret = generic_write_checks(iocb, from);
			
 
				+	if (ret <= 0)
			
 
				+		goto out;
			
 
				+
			
 
				 	/*
			
 
				-	 * Unaligned direct AIO must be serialized; see comment above
			
 
				-	 * In the case of O_APPEND, assume that we must always serialize
			
 
				+	 * Unaligned direct AIO must be serialized among each other as zeroing
			
 
				+	 * of partial blocks of two competing unaligned AIOs can result in data
			
 
				+	 * corruption.
			
 
				 	 */
			
 
				-	if (o_direct &&
			
 
				-	    ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
			
 
				+	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
			
 
				 	    !is_sync_kiocb(iocb) &&
			
 
				-	    (iocb->ki_flags & IOCB_APPEND ||
			
 
				-	     ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
			
 
				-		aio_mutex = ext4_aio_mutex(inode);
			
 
				-		mutex_lock(aio_mutex);
			
 
				+	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
			
 
				+		unaligned_aio = 1;
			
 
				 		ext4_unwritten_wait(inode);
			
 
				 	}
			
 
				 
			
 
				-	inode_lock(inode);
			
 
				-	ret = generic_write_checks(iocb, from);
			
 
				-	if (ret <= 0)
			
 
				-		goto out;
			
 
				-
			
 
				 	/*
			
 
				 	 * If we have encountered a bitmap-format file, the size limit
			
 
				 	 * is smaller than s_maxbytes, which is for extent-mapped files.
			
@@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
				 		blk_start_plug(&plug);
			
 
				 
			
 
				 		/* check whether we do a DIO overwrite or not */
			
 
				-		if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
			
 
				+		if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
			
 
				 		    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
			
 
				 			struct ext4_map_blocks map;
			
 
				 			unsigned int blkbits = inode->i_blkbits;
			
@@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
				 	if (o_direct)
			
 
				 		blk_finish_plug(&plug);
			
 
				 
			
 
				-	if (aio_mutex)
			
 
				-		mutex_unlock(aio_mutex);
			
 
				 	return ret;
			
 
				 
			
 
				 out:
			
 
				 	inode_unlock(inode);
			
 
				-	if (aio_mutex)
			
 
				-		mutex_unlock(aio_mutex);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -417,7 +411,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 
				  */
			
 
				 static int ext4_find_unwritten_pgoff(struct inode *inode,
			
 
				 				     int whence,
			
 
				-				     struct ext4_map_blocks *map,
			
 
				+				     ext4_lblk_t end_blk,
			
 
				 				     loff_t *offset)
			
 
				 {
			
 
				 	struct pagevec pvec;
			
@@ -432,7 +426,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 
				 	blkbits = inode->i_sb->s_blocksize_bits;
			
 
				 	startoff = *offset;
			
 
				 	lastoff = startoff;
			
 
				-	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
			
 
				+	endoff = (loff_t)end_blk << blkbits;
			
 
				 
			
 
				 	index = startoff >> PAGE_CACHE_SHIFT;
			
 
				 	end = endoff >> PAGE_CACHE_SHIFT;
			
@@ -550,12 +544,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 
				 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
			
 
				 {
			
 
				 	struct inode *inode = file->f_mapping->host;
			
 
				-	struct ext4_map_blocks map;
			
 
				 	struct extent_status es;
			
 
				 	ext4_lblk_t start, last, end;
			
 
				 	loff_t dataoff, isize;
			
 
				 	int blkbits;
			
 
				-	int ret = 0;
			
 
				+	int ret;
			
 
				 
			
 
				 	inode_lock(inode);
			
 
				 
			
@@ -572,41 +565,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 
				 	dataoff = offset;
			
 
				 
			
 
				 	do {
			
 
				-		map.m_lblk = last;
			
 
				-		map.m_len = end - last + 1;
			
 
				-		ret = ext4_map_blocks(NULL, inode, &map, 0);
			
 
				-		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
			
 
				-			if (last != start)
			
 
				-				dataoff = (loff_t)last << blkbits;
			
 
				-			break;
			
 
				+		ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
			
 
				+		if (ret <= 0) {
			
 
				+			/* No extent found -> no data */
			
 
				+			if (ret == 0)
			
 
				+				ret = -ENXIO;
			
 
				+			inode_unlock(inode);
			
 
				+			return ret;
			
 
				 		}
			
 
				 
			
 
				-		/*
			
 
				-		 * If there is a delay extent at this offset,
			
 
				-		 * it will be as a data.
			
 
				-		 */
			
 
				-		ext4_es_find_delayed_extent_range(inode, last, last, &es);
			
 
				-		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
			
 
				-			if (last != start)
			
 
				-				dataoff = (loff_t)last << blkbits;
			
 
				+		last = es.es_lblk;
			
 
				+		if (last != start)
			
 
				+			dataoff = (loff_t)last << blkbits;
			
 
				+		if (!ext4_es_is_unwritten(&es))
			
 
				 			break;
			
 
				-		}
			
 
				 
			
 
				 		/*
			
 
				 		 * If there is a unwritten extent at this offset,
			
 
				 		 * it will be as a data or a hole according to page
			
 
				 		 * cache that has data or not.
			
 
				 		 */
			
 
				-		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
			
 
				-			int unwritten;
			
 
				-			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
			
 
				-							      &map, &dataoff);
			
 
				-			if (unwritten)
			
 
				-				break;
			
 
				-		}
			
 
				-
			
 
				-		last++;
			
 
				+		if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
			
 
				+					      es.es_lblk + es.es_len, &dataoff))
			
 
				+			break;
			
 
				+		last += es.es_len;
			
 
				 		dataoff = (loff_t)last << blkbits;
			
 
				+		cond_resched();
			
 
				 	} while (last <= end);
			
 
				 
			
 
				 	inode_unlock(inode);
			
@@ -623,12 +607,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 
				 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
			
 
				 {
			
 
				 	struct inode *inode = file->f_mapping->host;
			
 
				-	struct ext4_map_blocks map;
			
 
				 	struct extent_status es;
			
 
				 	ext4_lblk_t start, last, end;
			
 
				 	loff_t holeoff, isize;
			
 
				 	int blkbits;
			
 
				-	int ret = 0;
			
 
				+	int ret;
			
 
				 
			
 
				 	inode_lock(inode);
			
 
				 
			
@@ -645,44 +628,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 
				 	holeoff = offset;
			
 
				 
			
 
				 	do {
			
 
				-		map.m_lblk = last;
			
 
				-		map.m_len = end - last + 1;
			
 
				-		ret = ext4_map_blocks(NULL, inode, &map, 0);
			
 
				-		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
			
 
				-			last += ret;
			
 
				-			holeoff = (loff_t)last << blkbits;
			
 
				-			continue;
			
 
				+		ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
			
 
				+		if (ret < 0) {
			
 
				+			inode_unlock(inode);
			
 
				+			return ret;
			
 
				 		}
			
 
				-
			
 
				-		/*
			
 
				-		 * If there is a delay extent at this offset,
			
 
				-		 * we will skip this extent.
			
 
				-		 */
			
 
				-		ext4_es_find_delayed_extent_range(inode, last, last, &es);
			
 
				-		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
			
 
				-			last = es.es_lblk + es.es_len;
			
 
				-			holeoff = (loff_t)last << blkbits;
			
 
				-			continue;
			
 
				+		/* Found a hole? */
			
 
				+		if (ret == 0 || es.es_lblk > last) {
			
 
				+			if (last != start)
			
 
				+				holeoff = (loff_t)last << blkbits;
			
 
				+			break;
			
 
				 		}
			
 
				-
			
 
				 		/*
			
 
				 		 * If there is a unwritten extent at this offset,
			
 
				 		 * it will be as a data or a hole according to page
			
 
				 		 * cache that has data or not.
			
 
				 		 */
			
 
				-		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
			
 
				-			int unwritten;
			
 
				-			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
			
 
				-							      &map, &holeoff);
			
 
				-			if (!unwritten) {
			
 
				-				last += ret;
			
 
				-				holeoff = (loff_t)last << blkbits;
			
 
				-				continue;
			
 
				-			}
			
 
				-		}
			
 
				+		if (ext4_es_is_unwritten(&es) &&
			
 
				+		    ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
			
 
				+					      last + es.es_len, &holeoff))
			
 
				+			break;
			
 
				 
			
 
				-		/* find a hole */
			
 
				-		break;
			
 
				+		last += es.es_len;
			
 
				+		holeoff = (loff_t)last << blkbits;
			
 
				+		cond_resched();
			
 
				 	} while (last <= end);
			
 
				 
			
 
				 	inode_unlock(inode);
			
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -787,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 
				 	sbi = EXT4_SB(sb);
			
 
				 
			
 
				 	/*
			
 
				-	 * Initalize owners and quota early so that we don't have to account
			
 
				+	 * Initialize owners and quota early so that we don't have to account
			
 
				 	 * for quota initialization worst case in standard inode creating
			
 
				 	 * transaction
			
 
				 	 */
			
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 
				 		goto got_it;
			
 
				 	}
			
 
				 
			
 
				-	/* Next simple case - plain lookup or failed read of indirect block */
			
 
				-	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
			
 
				+	/* Next simple case - plain lookup failed */
			
 
				+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
			
 
				+		unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
			
 
				+		int i;
			
 
				+
			
 
				+		/* Count number of blocks in a subtree under 'partial' */
			
 
				+		count = 1;
			
 
				+		for (i = 0; partial + i != chain + depth - 1; i++)
			
 
				+			count *= epb;
			
 
				+		/* Fill in size of a hole we found */
			
 
				+		map->m_pblk = 0;
			
 
				+		map->m_len = min_t(unsigned int, map->m_len, count);
			
 
				+		goto cleanup;
			
 
				+	}
			
 
				+
			
 
				+	/* Failed read of indirect block */
			
 
				+	if (err == -EIO)
			
 
				 		goto cleanup;
			
 
				 
			
 
				 	/*
			
@@ -693,21 +708,21 @@ ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
				 		}
			
 
				 		if (IS_DAX(inode))
			
 
				 			ret = dax_do_io(iocb, inode, iter, offset,
			
 
				-					ext4_get_block, NULL, 0);
			
 
				+					ext4_dio_get_block, NULL, 0);
			
 
				 		else
			
 
				 			ret = __blockdev_direct_IO(iocb, inode,
			
 
				 						   inode->i_sb->s_bdev, iter,
			
 
				-						   offset, ext4_get_block, NULL,
			
 
				-						   NULL, 0);
			
 
				+						   offset, ext4_dio_get_block,
			
 
				+						   NULL, NULL, 0);
			
 
				 		inode_dio_end(inode);
			
 
				 	} else {
			
 
				 locked:
			
 
				 		if (IS_DAX(inode))
			
 
				 			ret = dax_do_io(iocb, inode, iter, offset,
			
 
				-					ext4_get_block, NULL, DIO_LOCKING);
			
 
				+					ext4_dio_get_block, NULL, DIO_LOCKING);
			
 
				 		else
			
 
				 			ret = blockdev_direct_IO(iocb, inode, iter, offset,
			
 
				-						 ext4_get_block);
			
 
				+						 ext4_dio_get_block);
			
 
				 
			
 
				 		if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
			
 
				 			loff_t isize = i_size_read(inode);
			
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -581,9 +581,10 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
 
				 	if (ret)
			
 
				 		goto out;
			
 
				 
			
 
				-	if (ext4_should_dioread_nolock(inode))
			
 
				-		ret = __block_write_begin(page, from, to, ext4_get_block_write);
			
 
				-	else
			
 
				+	if (ext4_should_dioread_nolock(inode)) {
			
 
				+		ret = __block_write_begin(page, from, to,
			
 
				+					  ext4_get_block_unwritten);
			
 
				+	} else
			
 
				 		ret = __block_write_begin(page, from, to, ext4_get_block);
			
 
				 
			
 
				 	if (!ret && ext4_should_journal_data(inode)) {
			
@@ -1696,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle,
 
				 	if (err)
			
 
				 		goto out;
			
 
				 
			
 
				-	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
			
 
				 	err = ext4_mark_inode_dirty(handle, dir);
			
 
				 	if (unlikely(err))
			
 
				 		goto out;
			
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode)
 
				 		}
			
 
				 		truncate_inode_pages_final(&inode->i_data);
			
 
				 
			
 
				-		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
			
 
				 		goto no_delete;
			
 
				 	}
			
 
				 
			
@@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode)
 
				 		ext4_begin_ordered_truncate(inode, 0);
			
 
				 	truncate_inode_pages_final(&inode->i_data);
			
 
				 
			
 
				-	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
			
 
				-
			
 
				 	/*
			
 
				 	 * Protect us against freezing - iput() caller didn't have to have any
			
 
				 	 * protection against it
			
@@ -458,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 
				  * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
			
 
				  * based files
			
 
				  *
			
 
				- * On success, it returns the number of blocks being mapped or allocated.
			
 
				- * if create==0 and the blocks are pre-allocated and unwritten block,
			
 
				- * the result buffer head is unmapped. If the create ==1, it will make sure
			
 
				- * the buffer head is mapped.
			
 
				+ * On success, it returns the number of blocks being mapped or allocated.  If
			
 
				+ * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
			
 
				+ * is marked as unwritten. If create == 1, it will mark @map as mapped.
			
 
				  *
			
 
				  * It returns 0 if plain look up failed (blocks have not been allocated), in
			
 
				- * that case, buffer head is unmapped
			
 
				+ * that case, @map is returned as unmapped but we still do fill map->m_len to
			
 
				+ * indicate the length of a hole starting at map->m_lblk.
			
 
				  *
			
 
				  * It returns the error in case of allocation failure.
			
 
				  */
			
@@ -507,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 
				 				retval = map->m_len;
			
 
				 			map->m_len = retval;
			
 
				 		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
			
 
				+			map->m_pblk = 0;
			
 
				+			retval = es.es_len - (map->m_lblk - es.es_lblk);
			
 
				+			if (retval > map->m_len)
			
 
				+				retval = map->m_len;
			
 
				+			map->m_len = retval;
			
 
				 			retval = 0;
			
 
				 		} else {
			
 
				 			BUG_ON(1);
			
@@ -714,16 +716,11 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
 
				 		 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
			
 
				 }
			
 
				 
			
 
				-/* Maximum number of blocks we map for direct IO at once. */
			
 
				-#define DIO_MAX_BLOCKS 4096
			
 
				-
			
 
				 static int _ext4_get_block(struct inode *inode, sector_t iblock,
			
 
				 			   struct buffer_head *bh, int flags)
			
 
				 {
			
 
				-	handle_t *handle = ext4_journal_current_handle();
			
 
				 	struct ext4_map_blocks map;
			
 
				-	int ret = 0, started = 0;
			
 
				-	int dio_credits;
			
 
				+	int ret = 0;
			
 
				 
			
 
				 	if (ext4_has_inline_data(inode))
			
 
				 		return -ERANGE;
			
@@ -731,33 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
				 	map.m_lblk = iblock;
			
 
				 	map.m_len = bh->b_size >> inode->i_blkbits;
			
 
				 
			
 
				-	if (flags && !handle) {
			
 
				-		/* Direct IO write... */
			
 
				-		if (map.m_len > DIO_MAX_BLOCKS)
			
 
				-			map.m_len = DIO_MAX_BLOCKS;
			
 
				-		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
			
 
				-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
			
 
				-					    dio_credits);
			
 
				-		if (IS_ERR(handle)) {
			
 
				-			ret = PTR_ERR(handle);
			
 
				-			return ret;
			
 
				-		}
			
 
				-		started = 1;
			
 
				-	}
			
 
				-
			
 
				-	ret = ext4_map_blocks(handle, inode, &map, flags);
			
 
				+	ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
			
 
				+			      flags);
			
 
				 	if (ret > 0) {
			
 
				-		ext4_io_end_t *io_end = ext4_inode_aio(inode);
			
 
				-
			
 
				 		map_bh(bh, inode->i_sb, map.m_pblk);
			
 
				 		ext4_update_bh_state(bh, map.m_flags);
			
 
				-		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
			
 
				-			set_buffer_defer_completion(bh);
			
 
				 		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
			
 
				 		ret = 0;
			
 
				 	}
			
 
				-	if (started)
			
 
				-		ext4_journal_stop(handle);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -768,6 +746,155 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 
				 			       create ? EXT4_GET_BLOCKS_CREATE : 0);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Get block function used when preparing for buffered write if we require
			
 
				+ * creating an unwritten extent if blocks haven't been allocated.  The extent
			
 
				+ * will be converted to written after the IO is complete.
			
 
				+ */
			
 
				+int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
			
 
				+			     struct buffer_head *bh_result, int create)
			
 
				+{
			
 
				+	ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
			
 
				+		   inode->i_ino, create);
			
 
				+	return _ext4_get_block(inode, iblock, bh_result,
			
 
				+			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
			
 
				+}
			
 
				+
			
 
				+/* Maximum number of blocks we map for direct IO at once. */
			
 
				+#define DIO_MAX_BLOCKS 4096
			
 
				+
			
 
				+static handle_t *start_dio_trans(struct inode *inode,
			
 
				+				 struct buffer_head *bh_result)
			
 
				+{
			
 
				+	int dio_credits;
			
 
				+
			
 
				+	/* Trim mapping request to maximum we can map at once for DIO */
			
 
				+	if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
			
 
				+		bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
			
 
				+	dio_credits = ext4_chunk_trans_blocks(inode,
			
 
				+				      bh_result->b_size >> inode->i_blkbits);
			
 
				+	return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
			
 
				+}
			
 
				+
			
 
				+/* Get block function for DIO reads and writes to inodes without extents */
			
 
				+int ext4_dio_get_block(struct inode *inode, sector_t iblock,
			
 
				+		       struct buffer_head *bh, int create)
			
 
				+{
			
 
				+	handle_t *handle;
			
 
				+	int ret;
			
 
				+
			
 
				+	/* We don't expect handle for direct IO */
			
 
				+	WARN_ON_ONCE(ext4_journal_current_handle());
			
 
				+
			
 
				+	if (create) {
			
 
				+		handle = start_dio_trans(inode, bh);
			
 
				+		if (IS_ERR(handle))
			
 
				+			return PTR_ERR(handle);
			
 
				+	}
			
 
				+	ret = _ext4_get_block(inode, iblock, bh,
			
 
				+			      create ? EXT4_GET_BLOCKS_CREATE : 0);
			
 
				+	if (create)
			
 
				+		ext4_journal_stop(handle);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Get block function for AIO DIO writes when we create unwritten extent if
			
 
				+ * blocks are not allocated yet. The extent will be converted to written
			
 
				+ * after IO is complete.
			
 
				+ */
			
 
				+static int ext4_dio_get_block_unwritten_async(struct inode *inode,
			
 
				+		sector_t iblock, struct buffer_head *bh_result,	int create)
			
 
				+{
			
 
				+	handle_t *handle;
			
 
				+	int ret;
			
 
				+
			
 
				+	/* We don't expect handle for direct IO */
			
 
				+	WARN_ON_ONCE(ext4_journal_current_handle());
			
 
				+
			
 
				+	handle = start_dio_trans(inode, bh_result);
			
 
				+	if (IS_ERR(handle))
			
 
				+		return PTR_ERR(handle);
			
 
				+	ret = _ext4_get_block(inode, iblock, bh_result,
			
 
				+			      EXT4_GET_BLOCKS_IO_CREATE_EXT);
			
 
				+	ext4_journal_stop(handle);
			
 
				+
			
 
				+	/*
			
 
				+	 * When doing DIO using unwritten extents, we need io_end to convert
			
 
				+	 * unwritten extents to written on IO completion. We allocate io_end
			
 
				+	 * once we spot unwritten extent and store it in b_private. Generic
			
 
				+	 * DIO code keeps b_private set and furthermore passes the value to
			
 
				+	 * our completion callback in 'private' argument.
			
 
				+	 */
			
 
				+	if (!ret && buffer_unwritten(bh_result)) {
			
 
				+		if (!bh_result->b_private) {
			
 
				+			ext4_io_end_t *io_end;
			
 
				+
			
 
				+			io_end = ext4_init_io_end(inode, GFP_KERNEL);
			
 
				+			if (!io_end)
			
 
				+				return -ENOMEM;
			
 
				+			bh_result->b_private = io_end;
			
 
				+			ext4_set_io_unwritten_flag(inode, io_end);
			
 
				+		}
			
 
				+		set_buffer_defer_completion(bh_result);
			
 
				+	}
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Get block function for non-AIO DIO writes when we create unwritten extent if
			
 
				+ * blocks are not allocated yet. The extent will be converted to written
			
 
				+ * after IO is complete from ext4_ext_direct_IO() function.
			
 
				+ */
			
 
				+static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
			
 
				+		sector_t iblock, struct buffer_head *bh_result,	int create)
			
 
				+{
			
 
				+	handle_t *handle;
			
 
				+	int ret;
			
 
				+
			
 
				+	/* We don't expect handle for direct IO */
			
 
				+	WARN_ON_ONCE(ext4_journal_current_handle());
			
 
				+
			
 
				+	handle = start_dio_trans(inode, bh_result);
			
 
				+	if (IS_ERR(handle))
			
 
				+		return PTR_ERR(handle);
			
 
				+	ret = _ext4_get_block(inode, iblock, bh_result,
			
 
				+			      EXT4_GET_BLOCKS_IO_CREATE_EXT);
			
 
				+	ext4_journal_stop(handle);
			
 
				+
			
 
				+	/*
			
 
				+	 * Mark inode as having pending DIO writes to unwritten extents.
			
 
				+	 * ext4_ext_direct_IO() checks this flag and converts extents to
			
 
				+	 * written.
			
 
				+	 */
			
 
				+	if (!ret && buffer_unwritten(bh_result))
			
 
				+		ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
			
 
				+		   struct buffer_head *bh_result, int create)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
			
 
				+		   inode->i_ino, create);
			
 
				+	/* We don't expect handle for direct IO */
			
 
				+	WARN_ON_ONCE(ext4_journal_current_handle());
			
 
				+
			
 
				+	ret = _ext4_get_block(inode, iblock, bh_result, 0);
			
 
				+	/*
			
 
				+	 * Blocks should have been preallocated! ext4_file_write_iter() checks
			
 
				+	 * that.
			
 
				+	 */
			
 
				+	WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+
			
 
				 /*
			
 
				  * `handle' can be NULL if create is zero
			
 
				  */
			
@@ -1079,13 +1206,14 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 
				 #ifdef CONFIG_EXT4_FS_ENCRYPTION
			
 
				 	if (ext4_should_dioread_nolock(inode))
			
 
				 		ret = ext4_block_write_begin(page, pos, len,
			
 
				-					     ext4_get_block_write);
			
 
				+					     ext4_get_block_unwritten);
			
 
				 	else
			
 
				 		ret = ext4_block_write_begin(page, pos, len,
			
 
				 					     ext4_get_block);
			
 
				 #else
			
 
				 	if (ext4_should_dioread_nolock(inode))
			
 
				-		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
			
 
				+		ret = __block_write_begin(page, pos, len,
			
 
				+					  ext4_get_block_unwritten);
			
 
				 	else
			
 
				 		ret = __block_write_begin(page, pos, len, ext4_get_block);
			
 
				 #endif
			
@@ -3088,37 +3216,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 
				 		return try_to_free_buffers(page);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * ext4_get_block used when preparing for a DIO write or buffer write.
			
 
				- * We allocate an uinitialized extent if blocks haven't been allocated.
			
 
				- * The extent will be converted to initialized after the IO is complete.
			
 
				- */
			
 
				-int ext4_get_block_write(struct inode *inode, sector_t iblock,
			
 
				-		   struct buffer_head *bh_result, int create)
			
 
				-{
			
 
				-	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
			
 
				-		   inode->i_ino, create);
			
 
				-	return _ext4_get_block(inode, iblock, bh_result,
			
 
				-			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
			
 
				-}
			
 
				-
			
 
				-static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
			
 
				-		   struct buffer_head *bh_result, int create)
			
 
				-{
			
 
				-	int ret;
			
 
				-
			
 
				-	ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
			
 
				-		   inode->i_ino, create);
			
 
				-	ret = _ext4_get_block(inode, iblock, bh_result, 0);
			
 
				-	/*
			
 
				-	 * Blocks should have been preallocated! ext4_file_write_iter() checks
			
 
				-	 * that.
			
 
				-	 */
			
 
				-	WARN_ON_ONCE(!buffer_mapped(bh_result));
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				 #ifdef CONFIG_FS_DAX
			
 
				 int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
			
 
				 			    struct buffer_head *bh_result, int create)
			
@@ -3179,13 +3276,12 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
 
				 	WARN_ON_ONCE(ret == 0 && create);
			
 
				 	if (ret > 0) {
			
 
				 		map_bh(bh_result, inode->i_sb, map.m_pblk);
			
 
				-		bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
			
 
				-					map.m_flags;
			
 
				 		/*
			
 
				 		 * At least for now we have to clear BH_New so that DAX code
			
 
				 		 * doesn't attempt to zero blocks again in a racy way.
			
 
				 		 */
			
 
				-		bh_result->b_state &= ~(1 << BH_New);
			
 
				+		map.m_flags &= ~EXT4_MAP_NEW;
			
 
				+		ext4_update_bh_state(bh_result, map.m_flags);
			
 
				 		bh_result->b_size = map.m_len << inode->i_blkbits;
			
 
				 		ret = 0;
			
 
				 	}
			
@@ -3196,7 +3292,7 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
 
				 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
			
 
				 			    ssize_t size, void *private)
			
 
				 {
			
 
				-        ext4_io_end_t *io_end = iocb->private;
			
 
				+        ext4_io_end_t *io_end = private;
			
 
				 
			
 
				 	/* if not async direct IO just return */
			
 
				 	if (!io_end)
			
@@ -3204,10 +3300,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
				 
			
 
				 	ext_debug("ext4_end_io_dio(): io_end 0x%p "
			
 
				 		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
			
 
				- 		  iocb->private, io_end->inode->i_ino, iocb, offset,
			
 
				-		  size);
			
 
				+		  io_end, io_end->inode->i_ino, iocb, offset, size);
			
 
				 
			
 
				-	iocb->private = NULL;
			
 
				 	io_end->offset = offset;
			
 
				 	io_end->size = size;
			
 
				 	ext4_put_io_end(io_end);
			
@@ -3243,7 +3337,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
				 	get_block_t *get_block_func = NULL;
			
 
				 	int dio_flags = 0;
			
 
				 	loff_t final_size = offset + count;
			
 
				-	ext4_io_end_t *io_end = NULL;
			
 
				 
			
 
				 	/* Use the old path for reads and writes beyond i_size. */
			
 
				 	if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
			
@@ -3268,16 +3361,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
				 	/*
			
 
				 	 * We could direct write to holes and fallocate.
			
 
				 	 *
			
 
				-	 * Allocated blocks to fill the hole are marked as
			
 
				-	 * unwritten to prevent parallel buffered read to expose
			
 
				-	 * the stale data before DIO complete the data IO.
			
 
				+	 * Allocated blocks to fill the hole are marked as unwritten to prevent
			
 
				+	 * parallel buffered reads from exposing stale data before DIO completes
			
 
				+	 * the data IO.
			
 
				 	 *
			
 
				-	 * As to previously fallocated extents, ext4 get_block will
			
 
				-	 * just simply mark the buffer mapped but still keep the
			
 
				-	 * extents unwritten.
			
 
				+	 * As to previously fallocated extents, ext4 get_block will just simply
			
 
				+	 * mark the buffer mapped but still keep the extents unwritten.
			
 
				 	 *
			
 
				-	 * For non AIO case, we will convert those unwritten extents
			
 
				-	 * to written after return back from blockdev_direct_IO.
			
 
				+	 * For non AIO case, we will convert those unwritten extents to written
			
 
				+	 * after returning from blockdev_direct_IO. That way we avoid
			
 
				+	 * allocating io_end structure and also the overhead of offloading
			
 
				+	 * the extent conversion to a workqueue.
			
 
				 	 *
			
 
				 	 * For async DIO, the conversion needs to be deferred when the
			
 
				 	 * IO is completed. The ext4 end_io callback function will be
			
@@ -3285,30 +3379,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
				 	 * case, we allocate an io_end structure to hook to the iocb.
			
 
				 	 */
			
 
				 	iocb->private = NULL;
			
 
				-	if (overwrite) {
			
 
				-		get_block_func = ext4_get_block_overwrite;
			
 
				+	if (overwrite)
			
 
				+		get_block_func = ext4_dio_get_block_overwrite;
			
 
				+	else if (is_sync_kiocb(iocb)) {
			
 
				+		get_block_func = ext4_dio_get_block_unwritten_sync;
			
 
				+		dio_flags = DIO_LOCKING;
			
 
				 	} else {
			
 
				-		ext4_inode_aio_set(inode, NULL);
			
 
				-		if (!is_sync_kiocb(iocb)) {
			
 
				-			io_end = ext4_init_io_end(inode, GFP_NOFS);
			
 
				-			if (!io_end) {
			
 
				-				ret = -ENOMEM;
			
 
				-				goto retake_lock;
			
 
				-			}
			
 
				-			/*
			
 
				-			 * Grab reference for DIO. Will be dropped in
			
 
				-			 * ext4_end_io_dio()
			
 
				-			 */
			
 
				-			iocb->private = ext4_get_io_end(io_end);
			
 
				-			/*
			
 
				-			 * we save the io structure for current async direct
			
 
				-			 * IO, so that later ext4_map_blocks() could flag the
			
 
				-			 * io structure whether there is a unwritten extents
			
 
				-			 * needs to be converted when IO is completed.
			
 
				-			 */
			
 
				-			ext4_inode_aio_set(inode, io_end);
			
 
				-		}
			
 
				-		get_block_func = ext4_get_block_write;
			
 
				+		get_block_func = ext4_dio_get_block_unwritten_async;
			
 
				 		dio_flags = DIO_LOCKING;
			
 
				 	}
			
 
				 #ifdef CONFIG_EXT4_FS_ENCRYPTION
			
@@ -3323,27 +3400,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
				 					   get_block_func,
			
 
				 					   ext4_end_io_dio, NULL, dio_flags);
			
 
				 
			
 
				-	/*
			
 
				-	 * Put our reference to io_end. This can free the io_end structure e.g.
			
 
				-	 * in sync IO case or in case of error. It can even perform extent
			
 
				-	 * conversion if all bios we submitted finished before we got here.
			
 
				-	 * Note that in that case iocb->private can be already set to NULL
			
 
				-	 * here.
			
 
				-	 */
			
 
				-	if (io_end) {
			
 
				-		ext4_inode_aio_set(inode, NULL);
			
 
				-		ext4_put_io_end(io_end);
			
 
				-		/*
			
 
				-		 * When no IO was submitted ext4_end_io_dio() was not
			
 
				-		 * called so we have to put iocb's reference.
			
 
				-		 */
			
 
				-		if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
			
 
				-			WARN_ON(iocb->private != io_end);
			
 
				-			WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
			
 
				-			ext4_put_io_end(io_end);
			
 
				-			iocb->private = NULL;
			
 
				-		}
			
 
				-	}
			
 
				 	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
			
 
				 						EXT4_STATE_DIO_UNWRITTEN)) {
			
 
				 		int err;
			
@@ -3358,7 +3414,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
				 		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
			
 
				 	}
			
 
				 
			
 
				-retake_lock:
			
 
				 	if (iov_iter_rw(iter) == WRITE)
			
 
				 		inode_dio_end(inode);
			
 
				 	/* take i_mutex locking again if we do a ovewrite dio */
			
@@ -5261,6 +5316,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 
				 	might_sleep();
			
 
				 	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
			
 
				 	err = ext4_reserve_inode_write(handle, inode, &iloc);
			
 
				+	if (err)
			
 
				+		return err;
			
 
				 	if (ext4_handle_valid(handle) &&
			
 
				 	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
			
 
				 	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
			
@@ -5291,9 +5348,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-	if (!err)
			
 
				-		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
			
 
				-	return err;
			
 
				+	return ext4_mark_iloc_dirty(handle, inode, &iloc);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -5502,7 +5557,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	unlock_page(page);
			
 
				 	/* OK, we need to fill the hole... */
			
 
				 	if (ext4_should_dioread_nolock(inode))
			
 
				-		get_block = ext4_get_block_write;
			
 
				+		get_block = ext4_get_block_unwritten;
			
 
				 	else
			
 
				 		get_block = ext4_get_block;
			
 
				 retry_alloc:
			
@@ -5545,3 +5600,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 
			
 
				 	return err;
			
 
				 }
			
 
				+
			
 
				+/*
			
 
				+ * Find the first extent at or after @lblk in an inode that is not a hole.
			
 
				+ * Search for @map_len blocks at most. The extent is returned in @result.
			
 
				+ *
			
 
				+ * The function returns 1 if we found an extent. The function returns 0 in
			
 
				+ * case there is no extent at or after @lblk and in that case also sets
			
 
				+ * @result->es_len to 0. In case of error, the error code is returned.
			
 
				+ */
			
 
				+int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
			
 
				+			 unsigned int map_len, struct extent_status *result)
			
 
				+{
			
 
				+	struct ext4_map_blocks map;
			
 
				+	struct extent_status es = {};
			
 
				+	int ret;
			
 
				+
			
 
				+	map.m_lblk = lblk;
			
 
				+	map.m_len = map_len;
			
 
				+
			
 
				+	/*
			
 
				+	 * For non-extent based files this loop may iterate several times since
			
 
				+	 * we do not determine full hole size.
			
 
				+	 */
			
 
				+	while (map.m_len > 0) {
			
 
				+		ret = ext4_map_blocks(NULL, inode, &map, 0);
			
 
				+		if (ret < 0)
			
 
				+			return ret;
			
 
				+		/* There's extent covering m_lblk? Just return it. */
			
 
				+		if (ret > 0) {
			
 
				+			int status;
			
 
				+
			
 
				+			ext4_es_store_pblock(result, map.m_pblk);
			
 
				+			result->es_lblk = map.m_lblk;
			
 
				+			result->es_len = map.m_len;
			
 
				+			if (map.m_flags & EXT4_MAP_UNWRITTEN)
			
 
				+				status = EXTENT_STATUS_UNWRITTEN;
			
 
				+			else
			
 
				+				status = EXTENT_STATUS_WRITTEN;
			
 
				+			ext4_es_store_status(result, status);
			
 
				+			return 1;
			
 
				+		}
			
 
				+		ext4_es_find_delayed_extent_range(inode, map.m_lblk,
			
 
				+						  map.m_lblk + map.m_len - 1,
			
 
				+						  &es);
			
 
				+		/* Is delalloc data before next block in extent tree? */
			
 
				+		if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
			
 
				+			ext4_lblk_t offset = 0;
			
 
				+
			
 
				+			if (es.es_lblk < lblk)
			
 
				+				offset = lblk - es.es_lblk;
			
 
				+			result->es_lblk = es.es_lblk + offset;
			
 
				+			ext4_es_store_pblock(result,
			
 
				+					     ext4_es_pblock(&es) + offset);
			
 
				+			result->es_len = es.es_len - offset;
			
 
				+			ext4_es_store_status(result, ext4_es_status(&es));
			
 
				+
			
 
				+			return 1;
			
 
				+		}
			
 
				+		/* There's a hole at m_lblk, advance us after it */
			
 
				+		map.m_lblk += map.m_len;
			
 
				+		map_len -= map.m_len;
			
 
				+		map.m_len = map_len;
			
 
				+		cond_resched();
			
 
				+	}
			
 
				+	result->es_len = 0;
			
 
				+	return 0;
			
 
				+}
			
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -11,7 +11,7 @@
 
				  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				  * GNU General Public License for more details.
			
 
				  *
			
 
				- * You should have received a copy of the GNU General Public Licens
			
 
				+ * You should have received a copy of the GNU General Public License
			
 
				  * along with this program; if not, write to the Free Software
			
 
				  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
			
 
				  */
			
@@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
 
				  * for this page; do not hold this lock when calling this routine!
			
 
				  */
			
 
				 
			
 
				-static int ext4_mb_init_cache(struct page *page, char *incore)
			
 
				+static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
			
 
				 {
			
 
				 	ext4_group_t ngroups;
			
 
				 	int blocksize;
			
@@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
				 	/* allocate buffer_heads to read bitmaps */
			
 
				 	if (groups_per_page > 1) {
			
 
				 		i = sizeof(struct buffer_head *) * groups_per_page;
			
 
				-		bh = kzalloc(i, GFP_NOFS);
			
 
				+		bh = kzalloc(i, gfp);
			
 
				 		if (bh == NULL) {
			
 
				 			err = -ENOMEM;
			
 
				 			goto out;
			
@@ -983,7 +983,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
				  * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
			
 
				  */
			
 
				 static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
			
 
				-		ext4_group_t group, struct ext4_buddy *e4b)
			
 
				+		ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
			
 
				 {
			
 
				 	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
			
 
				 	int block, pnum, poff;
			
@@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 
				 	block = group * 2;
			
 
				 	pnum = block / blocks_per_page;
			
 
				 	poff = block % blocks_per_page;
			
 
				-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
			
 
				+	page = find_or_create_page(inode->i_mapping, pnum, gfp);
			
 
				 	if (!page)
			
 
				 		return -ENOMEM;
			
 
				 	BUG_ON(page->mapping != inode->i_mapping);
			
@@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 
				 
			
 
				 	block++;
			
 
				 	pnum = block / blocks_per_page;
			
 
				-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
			
 
				+	page = find_or_create_page(inode->i_mapping, pnum, gfp);
			
 
				 	if (!page)
			
 
				 		return -ENOMEM;
			
 
				 	BUG_ON(page->mapping != inode->i_mapping);
			
@@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
 
				  * calling this routine!
			
 
				  */
			
 
				 static noinline_for_stack
			
 
				-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
			
 
				+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
			
 
				 {
			
 
				 
			
 
				 	struct ext4_group_info *this_grp;
			
@@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 
				 	 * The call to ext4_mb_get_buddy_page_lock will mark the
			
 
				 	 * page accessed.
			
 
				 	 */
			
 
				-	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
			
 
				+	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
			
 
				 	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
			
 
				 		/*
			
 
				 		 * somebody initialized the group
			
@@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 
				 	}
			
 
				 
			
 
				 	page = e4b.bd_bitmap_page;
			
 
				-	ret = ext4_mb_init_cache(page, NULL);
			
 
				+	ret = ext4_mb_init_cache(page, NULL, gfp);
			
 
				 	if (ret)
			
 
				 		goto err;
			
 
				 	if (!PageUptodate(page)) {
			
@@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 
				 	}
			
 
				 	/* init buddy cache */
			
 
				 	page = e4b.bd_buddy_page;
			
 
				-	ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
			
 
				+	ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
			
 
				 	if (ret)
			
 
				 		goto err;
			
 
				 	if (!PageUptodate(page)) {
			
@@ -1109,8 +1109,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 
				  * calling this routine!
			
 
				  */
			
 
				 static noinline_for_stack int
			
 
				-ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
			
 
				-					struct ext4_buddy *e4b)
			
 
				+ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
			
 
				+		       struct ext4_buddy *e4b, gfp_t gfp)
			
 
				 {
			
 
				 	int blocks_per_page;
			
 
				 	int block;
			
@@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 
				 		 * we need full data about the group
			
 
				 		 * to make a good selection
			
 
				 		 */
			
 
				-		ret = ext4_mb_init_group(sb, group);
			
 
				+		ret = ext4_mb_init_group(sb, group, gfp);
			
 
				 		if (ret)
			
 
				 			return ret;
			
 
				 	}
			
@@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 
				 			 * wait for it to initialize.
			
 
				 			 */
			
 
				 			page_cache_release(page);
			
 
				-		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
			
 
				+		page = find_or_create_page(inode->i_mapping, pnum, gfp);
			
 
				 		if (page) {
			
 
				 			BUG_ON(page->mapping != inode->i_mapping);
			
 
				 			if (!PageUptodate(page)) {
			
 
				-				ret = ext4_mb_init_cache(page, NULL);
			
 
				+				ret = ext4_mb_init_cache(page, NULL, gfp);
			
 
				 				if (ret) {
			
 
				 					unlock_page(page);
			
 
				 					goto err;
			
@@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 
				 	if (page == NULL || !PageUptodate(page)) {
			
 
				 		if (page)
			
 
				 			page_cache_release(page);
			
 
				-		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
			
 
				+		page = find_or_create_page(inode->i_mapping, pnum, gfp);
			
 
				 		if (page) {
			
 
				 			BUG_ON(page->mapping != inode->i_mapping);
			
 
				 			if (!PageUptodate(page)) {
			
 
				-				ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
			
 
				+				ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
			
 
				+							 gfp);
			
 
				 				if (ret) {
			
 
				 					unlock_page(page);
			
 
				 					goto err;
			
@@ -1247,6 +1248,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
			
 
				+			      struct ext4_buddy *e4b)
			
 
				+{
			
 
				+	return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
			
 
				+}
			
 
				+
			
 
				 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
			
 
				 {
			
 
				 	if (e4b->bd_bitmap_page)
			
@@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 
				 
			
 
				 	/* We only do this if the grp has never been initialized */
			
 
				 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
			
 
				-		int ret = ext4_mb_init_group(ac->ac_sb, group);
			
 
				+		int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
			
 
				 		if (ret)
			
 
				 			return ret;
			
 
				 	}
			
@@ -4694,16 +4701,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
				 			    inode, bh, block);
			
 
				 	}
			
 
				 
			
 
				-	/*
			
 
				-	 * We need to make sure we don't reuse the freed block until
			
 
				-	 * after the transaction is committed, which we can do by
			
 
				-	 * treating the block as metadata, below.  We make an
			
 
				-	 * exception if the inode is to be written in writeback mode
			
 
				-	 * since writeback mode has weak data consistency guarantees.
			
 
				-	 */
			
 
				-	if (!ext4_should_writeback_data(inode))
			
 
				-		flags |= EXT4_FREE_BLOCKS_METADATA;
			
 
				-
			
 
				 	/*
			
 
				 	 * If the extent to be freed does not begin on a cluster
			
 
				 	 * boundary, we need to deal with partial clusters at the
			
@@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
				 
			
 
				 	if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
			
 
				 		int i;
			
 
				+		int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
			
 
				 
			
 
				 		for (i = 0; i < count; i++) {
			
 
				 			cond_resched();
			
 
				-			bh = sb_find_get_block(inode->i_sb, block + i);
			
 
				-			if (!bh)
			
 
				-				continue;
			
 
				-			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
			
 
				-				    inode, bh, block + i);
			
 
				+			if (is_metadata)
			
 
				+				bh = sb_find_get_block(inode->i_sb, block + i);
			
 
				+			ext4_forget(handle, is_metadata, inode, bh, block + i);
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -4815,16 +4811,23 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
				 #endif
			
 
				 	trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
			
 
				 
			
 
				-	err = ext4_mb_load_buddy(sb, block_group, &e4b);
			
 
				+	/* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
			
 
				+	err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
			
 
				+				     GFP_NOFS|__GFP_NOFAIL);
			
 
				 	if (err)
			
 
				 		goto error_return;
			
 
				 
			
 
				-	if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
			
 
				+	/*
			
 
				+	 * We need to make sure we don't reuse the freed block until after the
			
 
				+	 * transaction is committed. We make an exception if the inode is to be
			
 
				+	 * written in writeback mode since writeback mode has weak data
			
 
				+	 * consistency guarantees.
			
 
				+	 */
			
 
				+	if (ext4_handle_valid(handle) &&
			
 
				+	    ((flags & EXT4_FREE_BLOCKS_METADATA) ||
			
 
				+	     !ext4_should_writeback_data(inode))) {
			
 
				 		struct ext4_free_data *new_entry;
			
 
				 		/*
			
 
				-		 * blocks being freed are metadata. these blocks shouldn't
			
 
				-		 * be used until this transaction is committed
			
 
				-		 *
			
 
				 		 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
			
 
				 		 * to fail.
			
 
				 		 */
			
@@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 
				 		grp = ext4_get_group_info(sb, group);
			
 
				 		/* We only do this if the grp has never been initialized */
			
 
				 		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
			
 
				-			ret = ext4_mb_init_group(sb, group);
			
 
				+			ret = ext4_mb_init_group(sb, group, GFP_NOFS);
			
 
				 			if (ret)
			
 
				 				break;
			
 
				 		}
			
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -22,18 +22,6 @@
 
				 #include "ext4_jbd2.h"
			
 
				 #include "ext4.h"
			
 
				 
			
 
				-/*
			
 
				- * with AGGRESSIVE_CHECK allocator runs consistency checks over
			
 
				- * structures. these checks slow things down a lot
			
 
				- */
			
 
				-#define AGGRESSIVE_CHECK__
			
 
				-
			
 
				-/*
			
 
				- * with DOUBLE_CHECK defined mballoc creates persistent in-core
			
 
				- * bitmaps, maintains and uses them to check for double allocations
			
 
				- */
			
 
				-#define DOUBLE_CHECK__
			
 
				-
			
 
				 /*
			
 
				  */
			
 
				 #ifdef CONFIG_EXT4_DEBUG
			
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 
				 	 * blocks.
			
 
				 	 *
			
 
				 	 * While converting to extents we need not
			
 
				-	 * update the orignal inode i_blocks for extent blocks
			
 
				+	 * update the original inode i_blocks for extent blocks
			
 
				 	 * via quota APIs. The quota update happened via tmp_inode already.
			
 
				 	 */
			
 
				 	spin_lock(&inode->i_lock);
			
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
 
				 	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
			
 
				 	wait_on_buffer(*bh);
			
 
				 	if (!buffer_uptodate(*bh)) {
			
 
				-		brelse(*bh);
			
 
				-		*bh = NULL;
			
 
				 		ret = -EIO;
			
 
				 		goto warn_exit;
			
 
				 	}
			
 
				-
			
 
				 	mmp = (struct mmp_struct *)((*bh)->b_data);
			
 
				-	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
			
 
				+	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
			
 
				 		ret = -EFSCORRUPTED;
			
 
				-	else if (!ext4_mmp_csum_verify(sb, mmp))
			
 
				+		goto warn_exit;
			
 
				+	}
			
 
				+	if (!ext4_mmp_csum_verify(sb, mmp)) {
			
 
				 		ret = -EFSBADCRC;
			
 
				-	else
			
 
				-		return 0;
			
 
				-
			
 
				+		goto warn_exit;
			
 
				+	}
			
 
				+	return 0;
			
 
				 warn_exit:
			
 
				+	brelse(*bh);
			
 
				+	*bh = NULL;
			
 
				 	ext4_warning(sb, "Error %d while reading MMP block %llu",
			
 
				 		     ret, mmp_block);
			
 
				 	return ret;
			
@@ -181,15 +182,13 @@ static int kmmpd(void *data)
 
				 		    EXT4_FEATURE_INCOMPAT_MMP)) {
			
 
				 			ext4_warning(sb, "kmmpd being stopped since MMP feature"
			
 
				 				     " has been disabled.");
			
 
				-			EXT4_SB(sb)->s_mmp_tsk = NULL;
			
 
				-			goto failed;
			
 
				+			goto exit_thread;
			
 
				 		}
			
 
				 
			
 
				 		if (sb->s_flags & MS_RDONLY) {
			
 
				 			ext4_warning(sb, "kmmpd being stopped since filesystem "
			
 
				 				     "has been remounted as readonly.");
			
 
				-			EXT4_SB(sb)->s_mmp_tsk = NULL;
			
 
				-			goto failed;
			
 
				+			goto exit_thread;
			
 
				 		}
			
 
				 
			
 
				 		diff = jiffies - last_update_time;
			
@@ -211,9 +210,7 @@ static int kmmpd(void *data)
 
				 			if (retval) {
			
 
				 				ext4_error(sb, "error reading MMP data: %d",
			
 
				 					   retval);
			
 
				-
			
 
				-				EXT4_SB(sb)->s_mmp_tsk = NULL;
			
 
				-				goto failed;
			
 
				+				goto exit_thread;
			
 
				 			}
			
 
				 
			
 
				 			mmp_check = (struct mmp_struct *)(bh_check->b_data);
			
@@ -225,7 +222,9 @@ static int kmmpd(void *data)
 
				 					     "The filesystem seems to have been"
			
 
				 					     " multiply mounted.");
			
 
				 				ext4_error(sb, "abort");
			
 
				-				goto failed;
			
 
				+				put_bh(bh_check);
			
 
				+				retval = -EBUSY;
			
 
				+				goto exit_thread;
			
 
				 			}
			
 
				 			put_bh(bh_check);
			
 
				 		}
			
@@ -248,7 +247,8 @@ static int kmmpd(void *data)
 
				 
			
 
				 	retval = write_mmp_block(sb, bh);
			
 
				 
			
 
				-failed:
			
 
				+exit_thread:
			
 
				+	EXT4_SB(sb)->s_mmp_tsk = NULL;
			
 
				 	kfree(data);
			
 
				 	brelse(bh);
			
 
				 	return retval;
			
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
 
				 	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
			
 
				 	WARN_ON(io_end->handle);
			
 
				 
			
 
				-	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
			
 
				-		wake_up_all(ext4_ioend_wq(io_end->inode));
			
 
				-
			
 
				 	for (bio = io_end->bio; bio; bio = next_bio) {
			
 
				 		next_bio = bio->bi_private;
			
 
				 		ext4_finish_bio(bio);
			
@@ -265,7 +262,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 
				 {
			
 
				 	ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
			
 
				 	if (io) {
			
 
				-		atomic_inc(&EXT4_I(inode)->i_ioend_count);
			
 
				 		io->inode = inode;
			
 
				 		INIT_LIST_HEAD(&io->list);
			
 
				 		atomic_set(&io->count, 1);
			
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -55,7 +55,6 @@
 
				 
			
 
				 static struct ext4_lazy_init *ext4_li_info;
			
 
				 static struct mutex ext4_li_mtx;
			
 
				-static int ext4_mballoc_ready;
			
 
				 static struct ratelimit_state ext4_mount_msg_ratelimit;
			
 
				 
			
 
				 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
			
@@ -844,7 +843,6 @@ static void ext4_put_super(struct super_block *sb)
 
				 	ext4_release_system_zone(sb);
			
 
				 	ext4_mb_release(sb);
			
 
				 	ext4_ext_release(sb);
			
 
				-	ext4_xattr_put_super(sb);
			
 
				 
			
 
				 	if (!(sb->s_flags & MS_RDONLY)) {
			
 
				 		ext4_clear_feature_journal_needs_recovery(sb);
			
@@ -944,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 
				 	spin_lock_init(&ei->i_completed_io_lock);
			
 
				 	ei->i_sync_tid = 0;
			
 
				 	ei->i_datasync_tid = 0;
			
 
				-	atomic_set(&ei->i_ioend_count, 0);
			
 
				 	atomic_set(&ei->i_unwritten, 0);
			
 
				 	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
			
 
				 #ifdef CONFIG_EXT4_FS_ENCRYPTION
			
@@ -1425,9 +1422,9 @@ static const struct mount_opts {
 
				 	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
			
 
				 	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
			
 
				 	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
			
 
				-	 MOPT_NO_EXT2 | MOPT_SET},
			
 
				+	 MOPT_NO_EXT2},
			
 
				 	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
			
 
				-	 MOPT_NO_EXT2 | MOPT_CLEAR},
			
 
				+	 MOPT_NO_EXT2},
			
 
				 	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
			
 
				 	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
			
 
				 	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
			
@@ -1705,6 +1702,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 
				 		ext4_msg(sb, KERN_INFO, "dax option not supported");
			
 
				 		return -1;
			
 
				 #endif
			
 
				+	} else if (token == Opt_data_err_abort) {
			
 
				+		sbi->s_mount_opt |= m->mount_opt;
			
 
				+	} else if (token == Opt_data_err_ignore) {
			
 
				+		sbi->s_mount_opt &= ~m->mount_opt;
			
 
				 	} else {
			
 
				 		if (!args->from)
			
 
				 			arg = 1;
			
@@ -1914,6 +1915,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 
				 		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
			
 
				 	if (nodefs || sbi->s_max_dir_size_kb)
			
 
				 		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
			
 
				+	if (test_opt(sb, DATA_ERR_ABORT))
			
 
				+		SEQ_OPTS_PUTS("data_err=abort");
			
 
				 
			
 
				 	ext4_show_quota_options(seq, sb);
			
 
				 	return 0;
			
@@ -3796,12 +3799,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
				 	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
			
 
				 
			
 
				 no_journal:
			
 
				-	if (ext4_mballoc_ready) {
			
 
				-		sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
			
 
				-		if (!sbi->s_mb_cache) {
			
 
				-			ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
			
 
				-			goto failed_mount_wq;
			
 
				-		}
			
 
				+	sbi->s_mb_cache = ext4_xattr_create_cache();
			
 
				+	if (!sbi->s_mb_cache) {
			
 
				+		ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
			
 
				+		goto failed_mount_wq;
			
 
				 	}
			
 
				 
			
 
				 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
			
@@ -4027,6 +4028,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
				 	if (EXT4_SB(sb)->rsv_conversion_wq)
			
 
				 		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
			
 
				 failed_mount_wq:
			
 
				+	if (sbi->s_mb_cache) {
			
 
				+		ext4_xattr_destroy_cache(sbi->s_mb_cache);
			
 
				+		sbi->s_mb_cache = NULL;
			
 
				+	}
			
 
				 	if (sbi->s_journal) {
			
 
				 		jbd2_journal_destroy(sbi->s_journal);
			
 
				 		sbi->s_journal = NULL;
			
@@ -5321,7 +5326,6 @@ MODULE_ALIAS_FS("ext4");
 
				 
			
 
				 /* Shared across all ext4 file systems */
			
 
				 wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
			
 
				-struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
			
 
				 
			
 
				 static int __init ext4_init_fs(void)
			
 
				 {
			
@@ -5334,10 +5338,8 @@ static int __init ext4_init_fs(void)
 
				 	/* Build-time check for flags consistency */
			
 
				 	ext4_check_flag_values();
			
 
				 
			
 
				-	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
			
 
				-		mutex_init(&ext4__aio_mutex[i]);
			
 
				+	for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
			
 
				 		init_waitqueue_head(&ext4__ioend_wq[i]);
			
 
				-	}
			
 
				 
			
 
				 	err = ext4_init_es();
			
 
				 	if (err)
			
@@ -5358,8 +5360,6 @@ static int __init ext4_init_fs(void)
 
				 	err = ext4_init_mballoc();
			
 
				 	if (err)
			
 
				 		goto out2;
			
 
				-	else
			
 
				-		ext4_mballoc_ready = 1;
			
 
				 	err = init_inodecache();
			
 
				 	if (err)
			
 
				 		goto out1;
			
@@ -5375,7 +5375,6 @@ static int __init ext4_init_fs(void)
 
				 	unregister_as_ext3();
			
 
				 	destroy_inodecache();
			
 
				 out1:
			
 
				-	ext4_mballoc_ready = 0;
			
 
				 	ext4_exit_mballoc();
			
 
				 out2:
			
 
				 	ext4_exit_sysfs();
			
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -545,30 +545,44 @@ static void
 
				 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
			
 
				 			 struct buffer_head *bh)
			
 
				 {
			
 
				-	struct mb_cache_entry *ce = NULL;
			
 
				-	int error = 0;
			
 
				 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
			
 
				+	u32 hash, ref;
			
 
				+	int error = 0;
			
 
				 
			
 
				-	ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
			
 
				 	BUFFER_TRACE(bh, "get_write_access");
			
 
				 	error = ext4_journal_get_write_access(handle, bh);
			
 
				 	if (error)
			
 
				 		goto out;
			
 
				 
			
 
				 	lock_buffer(bh);
			
 
				-	if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
			
 
				+	hash = le32_to_cpu(BHDR(bh)->h_hash);
			
 
				+	ref = le32_to_cpu(BHDR(bh)->h_refcount);
			
 
				+	if (ref == 1) {
			
 
				 		ea_bdebug(bh, "refcount now=0; freeing");
			
 
				-		if (ce)
			
 
				-			mb_cache_entry_free(ce);
			
 
				+		/*
			
 
				+		 * This must happen under buffer lock for
			
 
				+		 * ext4_xattr_block_set() to reliably detect freed block
			
 
				+		 */
			
 
				+		mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
			
 
				 		get_bh(bh);
			
 
				 		unlock_buffer(bh);
			
 
				 		ext4_free_blocks(handle, inode, bh, 0, 1,
			
 
				 				 EXT4_FREE_BLOCKS_METADATA |
			
 
				 				 EXT4_FREE_BLOCKS_FORGET);
			
 
				 	} else {
			
 
				-		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
			
 
				-		if (ce)
			
 
				-			mb_cache_entry_release(ce);
			
 
				+		ref--;
			
 
				+		BHDR(bh)->h_refcount = cpu_to_le32(ref);
			
 
				+		if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
			
 
				+			struct mb_cache_entry *ce;
			
 
				+
			
 
				+			ce = mb_cache_entry_get(ext4_mb_cache, hash,
			
 
				+						bh->b_blocknr);
			
 
				+			if (ce) {
			
 
				+				ce->e_reusable = 1;
			
 
				+				mb_cache_entry_put(ext4_mb_cache, ce);
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				 		/*
			
 
				 		 * Beware of this ugliness: Releasing of xattr block references
			
 
				 		 * from different inodes can race and so we have to protect
			
@@ -790,8 +804,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
				 	if (i->value && i->value_len > sb->s_blocksize)
			
 
				 		return -ENOSPC;
			
 
				 	if (s->base) {
			
 
				-		ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
			
 
				-					bs->bh->b_blocknr);
			
 
				 		BUFFER_TRACE(bs->bh, "get_write_access");
			
 
				 		error = ext4_journal_get_write_access(handle, bs->bh);
			
 
				 		if (error)
			
@@ -799,10 +811,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
				 		lock_buffer(bs->bh);
			
 
				 
			
 
				 		if (header(s->base)->h_refcount == cpu_to_le32(1)) {
			
 
				-			if (ce) {
			
 
				-				mb_cache_entry_free(ce);
			
 
				-				ce = NULL;
			
 
				-			}
			
 
				+			__u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash);
			
 
				+
			
 
				+			/*
			
 
				+			 * This must happen under buffer lock for
			
 
				+			 * ext4_xattr_block_set() to reliably detect modified
			
 
				+			 * block
			
 
				+			 */
			
 
				+			mb_cache_entry_delete_block(ext4_mb_cache, hash,
			
 
				+						    bs->bh->b_blocknr);
			
 
				 			ea_bdebug(bs->bh, "modifying in-place");
			
 
				 			error = ext4_xattr_set_entry(i, s);
			
 
				 			if (!error) {
			
@@ -826,10 +843,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
				 			int offset = (char *)s->here - bs->bh->b_data;
			
 
				 
			
 
				 			unlock_buffer(bs->bh);
			
 
				-			if (ce) {
			
 
				-				mb_cache_entry_release(ce);
			
 
				-				ce = NULL;
			
 
				-			}
			
 
				 			ea_bdebug(bs->bh, "cloning");
			
 
				 			s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
			
 
				 			error = -ENOMEM;
			
@@ -872,6 +885,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
				 			if (new_bh == bs->bh)
			
 
				 				ea_bdebug(new_bh, "keeping");
			
 
				 			else {
			
 
				+				u32 ref;
			
 
				+
			
 
				 				/* The old block is released after updating
			
 
				 				   the inode. */
			
 
				 				error = dquot_alloc_block(inode,
			
@@ -884,9 +899,40 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
				 				if (error)
			
 
				 					goto cleanup_dquot;
			
 
				 				lock_buffer(new_bh);
			
 
				-				le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
			
 
				+				/*
			
 
				+				 * We have to be careful about races with
			
 
				+				 * freeing, rehashing or adding references to
			
 
				+				 * xattr block. Once we hold buffer lock xattr
			
 
				+				 * block's state is stable so we can check
			
 
				+				 * whether the block got freed / rehashed or
			
 
				+				 * not.  Since we unhash mbcache entry under
			
 
				+				 * buffer lock when freeing / rehashing xattr
			
 
				+				 * block, checking whether entry is still
			
 
				+				 * hashed is reliable. Same rules hold for
			
 
				+				 * e_reusable handling.
			
 
				+				 */
			
 
				+				if (hlist_bl_unhashed(&ce->e_hash_list) ||
			
 
				+				    !ce->e_reusable) {
			
 
				+					/*
			
 
				+					 * Undo everything and check mbcache
			
 
				+					 * again.
			
 
				+					 */
			
 
				+					unlock_buffer(new_bh);
			
 
				+					dquot_free_block(inode,
			
 
				+							 EXT4_C2B(EXT4_SB(sb),
			
 
				+								  1));
			
 
				+					brelse(new_bh);
			
 
				+					mb_cache_entry_put(ext4_mb_cache, ce);
			
 
				+					ce = NULL;
			
 
				+					new_bh = NULL;
			
 
				+					goto inserted;
			
 
				+				}
			
 
				+				ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
			
 
				+				BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
			
 
				+				if (ref >= EXT4_XATTR_REFCOUNT_MAX)
			
 
				+					ce->e_reusable = 0;
			
 
				 				ea_bdebug(new_bh, "reusing; refcount now=%d",
			
 
				-					le32_to_cpu(BHDR(new_bh)->h_refcount));
			
 
				+					  ref);
			
 
				 				unlock_buffer(new_bh);
			
 
				 				error = ext4_handle_dirty_xattr_block(handle,
			
 
				 								      inode,
			
@@ -894,7 +940,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
				 				if (error)
			
 
				 					goto cleanup_dquot;
			
 
				 			}
			
 
				-			mb_cache_entry_release(ce);
			
 
				+			mb_cache_entry_touch(ext4_mb_cache, ce);
			
 
				+			mb_cache_entry_put(ext4_mb_cache, ce);
			
 
				 			ce = NULL;
			
 
				 		} else if (bs->bh && s->base == bs->bh->b_data) {
			
 
				 			/* We were modifying this block in-place. */
			
@@ -959,7 +1006,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
				 
			
 
				 cleanup:
			
 
				 	if (ce)
			
 
				-		mb_cache_entry_release(ce);
			
 
				+		mb_cache_entry_put(ext4_mb_cache, ce);
			
 
				 	brelse(new_bh);
			
 
				 	if (!(bs->bh && s->base == bs->bh->b_data))
			
 
				 		kfree(s->base);
			
@@ -1070,6 +1117,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static int ext4_xattr_value_same(struct ext4_xattr_search *s,
			
 
				+				 struct ext4_xattr_info *i)
			
 
				+{
			
 
				+	void *value;
			
 
				+
			
 
				+	if (le32_to_cpu(s->here->e_value_size) != i->value_len)
			
 
				+		return 0;
			
 
				+	value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
			
 
				+	return !memcmp(value, i->value, i->value_len);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * ext4_xattr_set_handle()
			
 
				  *
			
@@ -1146,6 +1204,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 
				 		else if (!bs.s.not_found)
			
 
				 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
			
 
				 	} else {
			
 
				+		error = 0;
			
 
				+		/* Xattr value did not change? Save us some work and bail out */
			
 
				+		if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i))
			
 
				+			goto cleanup;
			
 
				+		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
			
 
				+			goto cleanup;
			
 
				+
			
 
				 		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
			
 
				 		if (!error && !bs.s.not_found) {
			
 
				 			i.value = NULL;
			
@@ -1511,17 +1576,6 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 
				 	brelse(bh);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * ext4_xattr_put_super()
			
 
				- *
			
 
				- * This is called when a file system is unmounted.
			
 
				- */
			
 
				-void
			
 
				-ext4_xattr_put_super(struct super_block *sb)
			
 
				-{
			
 
				-	mb_cache_shrink(sb->s_bdev);
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * ext4_xattr_cache_insert()
			
 
				  *
			
@@ -1533,26 +1587,19 @@ ext4_xattr_put_super(struct super_block *sb)
 
				 static void
			
 
				 ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
			
 
				 {
			
 
				-	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
			
 
				-	struct mb_cache_entry *ce;
			
 
				+	struct ext4_xattr_header *header = BHDR(bh);
			
 
				+	__u32 hash = le32_to_cpu(header->h_hash);
			
 
				+	int reusable = le32_to_cpu(header->h_refcount) <
			
 
				+		       EXT4_XATTR_REFCOUNT_MAX;
			
 
				 	int error;
			
 
				 
			
 
				-	ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
			
 
				-	if (!ce) {
			
 
				-		ea_bdebug(bh, "out of memory");
			
 
				-		return;
			
 
				-	}
			
 
				-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
			
 
				+	error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
			
 
				+				      bh->b_blocknr, reusable);
			
 
				 	if (error) {
			
 
				-		mb_cache_entry_free(ce);
			
 
				-		if (error == -EBUSY) {
			
 
				+		if (error == -EBUSY)
			
 
				 			ea_bdebug(bh, "already in cache");
			
 
				-			error = 0;
			
 
				-		}
			
 
				-	} else {
			
 
				+	} else
			
 
				 		ea_bdebug(bh, "inserting [%x]", (int)hash);
			
 
				-		mb_cache_entry_release(ce);
			
 
				-	}
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1614,33 +1661,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
 
				 	if (!header->h_hash)
			
 
				 		return NULL;  /* never share */
			
 
				 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
			
 
				-again:
			
 
				-	ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
			
 
				-				       hash);
			
 
				+	ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
			
 
				 	while (ce) {
			
 
				 		struct buffer_head *bh;
			
 
				 
			
 
				-		if (IS_ERR(ce)) {
			
 
				-			if (PTR_ERR(ce) == -EAGAIN)
			
 
				-				goto again;
			
 
				-			break;
			
 
				-		}
			
 
				 		bh = sb_bread(inode->i_sb, ce->e_block);
			
 
				 		if (!bh) {
			
 
				 			EXT4_ERROR_INODE(inode, "block %lu read error",
			
 
				 					 (unsigned long) ce->e_block);
			
 
				-		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
			
 
				-				EXT4_XATTR_REFCOUNT_MAX) {
			
 
				-			ea_idebug(inode, "block %lu refcount %d>=%d",
			
 
				-				  (unsigned long) ce->e_block,
			
 
				-				  le32_to_cpu(BHDR(bh)->h_refcount),
			
 
				-					  EXT4_XATTR_REFCOUNT_MAX);
			
 
				 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
			
 
				 			*pce = ce;
			
 
				 			return bh;
			
 
				 		}
			
 
				 		brelse(bh);
			
 
				-		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
			
 
				+		ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
			
 
				 	}
			
 
				 	return NULL;
			
 
				 }
			
@@ -1716,9 +1750,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 
				 #define	HASH_BUCKET_BITS	10
			
 
				 
			
 
				 struct mb_cache *
			
 
				-ext4_xattr_create_cache(char *name)
			
 
				+ext4_xattr_create_cache(void)
			
 
				 {
			
 
				-	return mb_cache_create(name, HASH_BUCKET_BITS);
			
 
				+	return mb_cache_create(HASH_BUCKET_BITS);
			
 
				 }
			
 
				 
			
 
				 void ext4_xattr_destroy_cache(struct mb_cache *cache)
			
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_
 
				 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
			
 
				 
			
 
				 extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
			
 
				-extern void ext4_xattr_put_super(struct super_block *);
			
 
				 
			
 
				 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
			
 
				 			    struct ext4_inode *raw_inode, handle_t *handle);
			
@@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 
				 				       struct ext4_xattr_info *i,
			
 
				 				       struct ext4_xattr_ibody_find *is);
			
 
				 
			
 
				-extern struct mb_cache *ext4_xattr_create_cache(char *name);
			
 
				+extern struct mb_cache *ext4_xattr_create_cache(void);
			
 
				 extern void ext4_xattr_destroy_cache(struct mb_cache *);
			
 
				 
			
 
				 #ifdef CONFIG_EXT4_FS_SECURITY
			
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal,
 
				 	if (is_journal_aborted(journal))
			
 
				 		return 0;
			
 
				 
			
 
				-	bh = jbd2_journal_get_descriptor_buffer(journal);
			
 
				+	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
			
 
				+						JBD2_COMMIT_BLOCK);
			
 
				 	if (!bh)
			
 
				 		return 1;
			
 
				 
			
 
				 	tmp = (struct commit_header *)bh->b_data;
			
 
				-	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
			
 
				-	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
			
 
				-	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
			
 
				 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
			
 
				 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
			
 
				 
			
@@ -222,7 +220,7 @@ static int journal_submit_data_buffers(journal_t *journal,
 
				 	spin_lock(&journal->j_list_lock);
			
 
				 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
			
 
				 		mapping = jinode->i_vfs_inode->i_mapping;
			
 
				-		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
			
 
				+		jinode->i_flags |= JI_COMMIT_RUNNING;
			
 
				 		spin_unlock(&journal->j_list_lock);
			
 
				 		/*
			
 
				 		 * submit the inode data buffers. We use writepage
			
@@ -236,8 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal,
 
				 			ret = err;
			
 
				 		spin_lock(&journal->j_list_lock);
			
 
				 		J_ASSERT(jinode->i_transaction == commit_transaction);
			
 
				-		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
			
 
				-		smp_mb__after_atomic();
			
 
				+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
			
 
				+		smp_mb();
			
 
				 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
			
 
				 	}
			
 
				 	spin_unlock(&journal->j_list_lock);
			
@@ -258,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 
				 	/* For locking, see the comment in journal_submit_data_buffers() */
			
 
				 	spin_lock(&journal->j_list_lock);
			
 
				 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
			
 
				-		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
			
 
				+		jinode->i_flags |= JI_COMMIT_RUNNING;
			
 
				 		spin_unlock(&journal->j_list_lock);
			
 
				 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
			
 
				 		if (err) {
			
@@ -274,8 +272,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 
				 				ret = err;
			
 
				 		}
			
 
				 		spin_lock(&journal->j_list_lock);
			
 
				-		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
			
 
				-		smp_mb__after_atomic();
			
 
				+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
			
 
				+		smp_mb();
			
 
				 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
			
 
				 	}
			
 
				 
			
@@ -319,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
 
				 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
			
 
				 }
			
 
				 
			
 
				-static void jbd2_descr_block_csum_set(journal_t *j,
			
 
				-				      struct buffer_head *bh)
			
 
				-{
			
 
				-	struct jbd2_journal_block_tail *tail;
			
 
				-	__u32 csum;
			
 
				-
			
 
				-	if (!jbd2_journal_has_csum_v2or3(j))
			
 
				-		return;
			
 
				-
			
 
				-	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
			
 
				-			sizeof(struct jbd2_journal_block_tail));
			
 
				-	tail->t_checksum = 0;
			
 
				-	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
			
 
				-	tail->t_checksum = cpu_to_be32(csum);
			
 
				-}
			
 
				-
			
 
				 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
			
 
				 				    struct buffer_head *bh, __u32 sequence)
			
 
				 {
			
@@ -379,7 +361,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
				 	ktime_t start_time;
			
 
				 	u64 commit_time;
			
 
				 	char *tagp = NULL;
			
 
				-	journal_header_t *header;
			
 
				 	journal_block_tag_t *tag = NULL;
			
 
				 	int space_left = 0;
			
 
				 	int first_tag = 0;
			
@@ -554,8 +535,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
				 		jbd2_journal_abort(journal, err);
			
 
				 
			
 
				 	blk_start_plug(&plug);
			
 
				-	jbd2_journal_write_revoke_records(journal, commit_transaction,
			
 
				-					  &log_bufs, WRITE_SYNC);
			
 
				+	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
			
 
				 
			
 
				 	jbd_debug(3, "JBD2: commit phase 2b\n");
			
 
				 
			
@@ -616,7 +596,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
				 
			
 
				 			jbd_debug(4, "JBD2: get descriptor\n");
			
 
				 
			
 
				-			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			
 
				+			descriptor = jbd2_journal_get_descriptor_buffer(
			
 
				+							commit_transaction,
			
 
				+							JBD2_DESCRIPTOR_BLOCK);
			
 
				 			if (!descriptor) {
			
 
				 				jbd2_journal_abort(journal, -EIO);
			
 
				 				continue;
			
@@ -625,11 +607,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
				 			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
			
 
				 				(unsigned long long)descriptor->b_blocknr,
			
 
				 				descriptor->b_data);
			
 
				-			header = (journal_header_t *)descriptor->b_data;
			
 
				-			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			
 
				-			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			
 
				-			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
			
 
				-
			
 
				 			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			
 
				 			space_left = descriptor->b_size -
			
 
				 						sizeof(journal_header_t);
			
@@ -721,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
				 
			
 
				 			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
			
 
				 
			
 
				-			jbd2_descr_block_csum_set(journal, descriptor);
			
 
				+			jbd2_descriptor_block_csum_set(journal, descriptor);
			
 
				 start_journal_io:
			
 
				 			for (i = 0; i < bufs; i++) {
			
 
				 				struct buffer_head *bh = wbuf[i];
			
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
 
				  * But we don't bother doing that, so there will be coherency problems with
			
 
				  * mmaps of blockdevs which hold live JBD-controlled filesystems.
			
 
				  */
			
 
				-struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
			
 
				+struct buffer_head *
			
 
				+jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
			
 
				 {
			
 
				+	journal_t *journal = transaction->t_journal;
			
 
				 	struct buffer_head *bh;
			
 
				 	unsigned long long blocknr;
			
 
				+	journal_header_t *header;
			
 
				 	int err;
			
 
				 
			
 
				 	err = jbd2_journal_next_log_block(journal, &blocknr);
			
@@ -821,12 +824,31 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
 
				 		return NULL;
			
 
				 	lock_buffer(bh);
			
 
				 	memset(bh->b_data, 0, journal->j_blocksize);
			
 
				+	header = (journal_header_t *)bh->b_data;
			
 
				+	header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
			
 
				+	header->h_blocktype = cpu_to_be32(type);
			
 
				+	header->h_sequence = cpu_to_be32(transaction->t_tid);
			
 
				 	set_buffer_uptodate(bh);
			
 
				 	unlock_buffer(bh);
			
 
				 	BUFFER_TRACE(bh, "return this buffer");
			
 
				 	return bh;
			
 
				 }
			
 
				 
			
 
				+void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
			
 
				+{
			
 
				+	struct jbd2_journal_block_tail *tail;
			
 
				+	__u32 csum;
			
 
				+
			
 
				+	if (!jbd2_journal_has_csum_v2or3(j))
			
 
				+		return;
			
 
				+
			
 
				+	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
			
 
				+			sizeof(struct jbd2_journal_block_tail));
			
 
				+	tail->t_checksum = 0;
			
 
				+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
			
 
				+	tail->t_checksum = cpu_to_be32(csum);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Return tid of the oldest transaction in the journal and block in the journal
			
 
				  * where the transaction starts.
			
@@ -1408,11 +1430,12 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 
				 /**
			
 
				  * jbd2_mark_journal_empty() - Mark on disk journal as empty.
			
 
				  * @journal: The journal to update.
			
 
				+ * @write_op: With which operation should we write the journal sb
			
 
				  *
			
 
				  * Update a journal's dynamic superblock fields to show that journal is empty.
			
 
				  * Write updated superblock to disk waiting for IO to complete.
			
 
				  */
			
 
				-static void jbd2_mark_journal_empty(journal_t *journal)
			
 
				+static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
			
 
				 {
			
 
				 	journal_superblock_t *sb = journal->j_superblock;
			
 
				 
			
@@ -1430,7 +1453,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
 
				 	sb->s_start    = cpu_to_be32(0);
			
 
				 	read_unlock(&journal->j_state_lock);
			
 
				 
			
 
				-	jbd2_write_superblock(journal, WRITE_FUA);
			
 
				+	jbd2_write_superblock(journal, write_op);
			
 
				 
			
 
				 	/* Log is no longer empty */
			
 
				 	write_lock(&journal->j_state_lock);
			
@@ -1716,7 +1739,13 @@ int jbd2_journal_destroy(journal_t *journal)
 
				 	if (journal->j_sb_buffer) {
			
 
				 		if (!is_journal_aborted(journal)) {
			
 
				 			mutex_lock(&journal->j_checkpoint_mutex);
			
 
				-			jbd2_mark_journal_empty(journal);
			
 
				+
			
 
				+			write_lock(&journal->j_state_lock);
			
 
				+			journal->j_tail_sequence =
			
 
				+				++journal->j_transaction_sequence;
			
 
				+			write_unlock(&journal->j_state_lock);
			
 
				+
			
 
				+			jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
			
 
				 			mutex_unlock(&journal->j_checkpoint_mutex);
			
 
				 		} else
			
 
				 			err = -EIO;
			
@@ -1975,7 +2004,7 @@ int jbd2_journal_flush(journal_t *journal)
 
				 	 * the magic code for a fully-recovered superblock.  Any future
			
 
				 	 * commits of data to the journal will restore the current
			
 
				 	 * s_start value. */
			
 
				-	jbd2_mark_journal_empty(journal);
			
 
				+	jbd2_mark_journal_empty(journal, WRITE_FUA);
			
 
				 	mutex_unlock(&journal->j_checkpoint_mutex);
			
 
				 	write_lock(&journal->j_state_lock);
			
 
				 	J_ASSERT(!journal->j_running_transaction);
			
@@ -2021,7 +2050,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 
				 	if (write) {
			
 
				 		/* Lock to make assertions happy... */
			
 
				 		mutex_lock(&journal->j_checkpoint_mutex);
			
 
				-		jbd2_mark_journal_empty(journal);
			
 
				+		jbd2_mark_journal_empty(journal, WRITE_FUA);
			
 
				 		mutex_unlock(&journal->j_checkpoint_mutex);
			
 
				 	}
			
 
				 
			
@@ -2565,7 +2594,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
 
				 restart:
			
 
				 	spin_lock(&journal->j_list_lock);
			
 
				 	/* Is commit writing out inode - we have to wait */
			
 
				-	if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
			
 
				+	if (jinode->i_flags & JI_COMMIT_RUNNING) {
			
 
				 		wait_queue_head_t *wq;
			
 
				 		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
			
 
				 		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
			
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int jbd2_descr_block_csum_verify(journal_t *j,
			
 
				-					void *buf)
			
 
				+static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
			
 
				 {
			
 
				 	struct jbd2_journal_block_tail *tail;
			
 
				 	__be32 provided;
			
@@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal,
 
				 				descr_csum_size =
			
 
				 					sizeof(struct jbd2_journal_block_tail);
			
 
				 			if (descr_csum_size > 0 &&
			
 
				-			    !jbd2_descr_block_csum_verify(journal,
			
 
				-							  bh->b_data)) {
			
 
				+			    !jbd2_descriptor_block_csum_verify(journal,
			
 
				+							       bh->b_data)) {
			
 
				 				printk(KERN_ERR "JBD2: Invalid checksum "
			
 
				 				       "recovering block %lu in log\n",
			
 
				 				       next_log_block);
			
@@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal,
 
				 	return err;
			
 
				 }
			
 
				 
			
 
				-static int jbd2_revoke_block_csum_verify(journal_t *j,
			
 
				-					 void *buf)
			
 
				-{
			
 
				-	struct jbd2_journal_revoke_tail *tail;
			
 
				-	__be32 provided;
			
 
				-	__u32 calculated;
			
 
				-
			
 
				-	if (!jbd2_journal_has_csum_v2or3(j))
			
 
				-		return 1;
			
 
				-
			
 
				-	tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
			
 
				-			sizeof(struct jbd2_journal_revoke_tail));
			
 
				-	provided = tail->r_checksum;
			
 
				-	tail->r_checksum = 0;
			
 
				-	calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
			
 
				-	tail->r_checksum = provided;
			
 
				-
			
 
				-	return provided == cpu_to_be32(calculated);
			
 
				-}
			
 
				-
			
 
				 /* Scan a revoke record, marking all blocks mentioned as revoked. */
			
 
				 
			
 
				 static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
			
@@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 
				 	offset = sizeof(jbd2_journal_revoke_header_t);
			
 
				 	rcount = be32_to_cpu(header->r_count);
			
 
				 
			
 
				-	if (!jbd2_revoke_block_csum_verify(journal, header))
			
 
				+	if (!jbd2_descriptor_block_csum_verify(journal, header))
			
 
				 		return -EFSBADCRC;
			
 
				 
			
 
				 	if (jbd2_journal_has_csum_v2or3(journal))
			
 
				-		csum_size = sizeof(struct jbd2_journal_revoke_tail);
			
 
				+		csum_size = sizeof(struct jbd2_journal_block_tail);
			
 
				 	if (rcount > journal->j_blocksize - csum_size)
			
 
				 		return -EINVAL;
			
 
				 	max = rcount;
			
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,11 +122,11 @@ struct jbd2_revoke_table_s
 
				 
			
 
				 
			
 
				 #ifdef __KERNEL__
			
 
				-static void write_one_revoke_record(journal_t *, transaction_t *,
			
 
				+static void write_one_revoke_record(transaction_t *,
			
 
				 				    struct list_head *,
			
 
				 				    struct buffer_head **, int *,
			
 
				-				    struct jbd2_revoke_record_s *, int);
			
 
				-static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
			
 
				+				    struct jbd2_revoke_record_s *);
			
 
				+static void flush_descriptor(journal_t *, struct buffer_head *, int);
			
 
				 #endif
			
 
				 
			
 
				 /* Utility functions to maintain the revoke table */
			
@@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
 
				  * Write revoke records to the journal for all entries in the current
			
 
				  * revoke hash, deleting the entries as we go.
			
 
				  */
			
 
				-void jbd2_journal_write_revoke_records(journal_t *journal,
			
 
				-				       transaction_t *transaction,
			
 
				-				       struct list_head *log_bufs,
			
 
				-				       int write_op)
			
 
				+void jbd2_journal_write_revoke_records(transaction_t *transaction,
			
 
				+				       struct list_head *log_bufs)
			
 
				 {
			
 
				+	journal_t *journal = transaction->t_journal;
			
 
				 	struct buffer_head *descriptor;
			
 
				 	struct jbd2_revoke_record_s *record;
			
 
				 	struct jbd2_revoke_table_s *revoke;
			
@@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
 
				 		while (!list_empty(hash_list)) {
			
 
				 			record = (struct jbd2_revoke_record_s *)
			
 
				 				hash_list->next;
			
 
				-			write_one_revoke_record(journal, transaction, log_bufs,
			
 
				-						&descriptor, &offset,
			
 
				-						record, write_op);
			
 
				+			write_one_revoke_record(transaction, log_bufs,
			
 
				+						&descriptor, &offset, record);
			
 
				 			count++;
			
 
				 			list_del(&record->hash);
			
 
				 			kmem_cache_free(jbd2_revoke_record_cache, record);
			
 
				 		}
			
 
				 	}
			
 
				 	if (descriptor)
			
 
				-		flush_descriptor(journal, descriptor, offset, write_op);
			
 
				+		flush_descriptor(journal, descriptor, offset);
			
 
				 	jbd_debug(1, "Wrote %d revoke records\n", count);
			
 
				 }
			
 
				 
			
@@ -562,18 +560,16 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
 
				  * block if the old one is full or if we have not already created one.
			
 
				  */
			
 
				 
			
 
				-static void write_one_revoke_record(journal_t *journal,
			
 
				-				    transaction_t *transaction,
			
 
				+static void write_one_revoke_record(transaction_t *transaction,
			
 
				 				    struct list_head *log_bufs,
			
 
				 				    struct buffer_head **descriptorp,
			
 
				 				    int *offsetp,
			
 
				-				    struct jbd2_revoke_record_s *record,
			
 
				-				    int write_op)
			
 
				+				    struct jbd2_revoke_record_s *record)
			
 
				 {
			
 
				+	journal_t *journal = transaction->t_journal;
			
 
				 	int csum_size = 0;
			
 
				 	struct buffer_head *descriptor;
			
 
				 	int sz, offset;
			
 
				-	journal_header_t *header;
			
 
				 
			
 
				 	/* If we are already aborting, this all becomes a noop.  We
			
 
				            still need to go round the loop in
			
@@ -587,7 +583,7 @@ static void write_one_revoke_record(journal_t *journal,
 
				 
			
 
				 	/* Do we need to leave space at the end for a checksum? */
			
 
				 	if (jbd2_journal_has_csum_v2or3(journal))
			
 
				-		csum_size = sizeof(struct jbd2_journal_revoke_tail);
			
 
				+		csum_size = sizeof(struct jbd2_journal_block_tail);
			
 
				 
			
 
				 	if (jbd2_has_feature_64bit(journal))
			
 
				 		sz = 8;
			
@@ -597,19 +593,16 @@ static void write_one_revoke_record(journal_t *journal,
 
				 	/* Make sure we have a descriptor with space left for the record */
			
 
				 	if (descriptor) {
			
 
				 		if (offset + sz > journal->j_blocksize - csum_size) {
			
 
				-			flush_descriptor(journal, descriptor, offset, write_op);
			
 
				+			flush_descriptor(journal, descriptor, offset);
			
 
				 			descriptor = NULL;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	if (!descriptor) {
			
 
				-		descriptor = jbd2_journal_get_descriptor_buffer(journal);
			
 
				+		descriptor = jbd2_journal_get_descriptor_buffer(transaction,
			
 
				+							JBD2_REVOKE_BLOCK);
			
 
				 		if (!descriptor)
			
 
				 			return;
			
 
				-		header = (journal_header_t *)descriptor->b_data;
			
 
				-		header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			
 
				-		header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
			
 
				-		header->h_sequence  = cpu_to_be32(transaction->t_tid);
			
 
				 
			
 
				 		/* Record it so that we can wait for IO completion later */
			
 
				 		BUFFER_TRACE(descriptor, "file in log_bufs");
			
@@ -630,21 +623,6 @@ static void write_one_revoke_record(journal_t *journal,
 
				 	*offsetp = offset;
			
 
				 }
			
 
				 
			
 
				-static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
			
 
				-{
			
 
				-	struct jbd2_journal_revoke_tail *tail;
			
 
				-	__u32 csum;
			
 
				-
			
 
				-	if (!jbd2_journal_has_csum_v2or3(j))
			
 
				-		return;
			
 
				-
			
 
				-	tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
			
 
				-			sizeof(struct jbd2_journal_revoke_tail));
			
 
				-	tail->r_checksum = 0;
			
 
				-	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
			
 
				-	tail->r_checksum = cpu_to_be32(csum);
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * Flush a revoke descriptor out to the journal.  If we are aborting,
			
 
				  * this is a noop; otherwise we are generating a buffer which needs to
			
@@ -654,7 +632,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
 
				 
			
 
				 static void flush_descriptor(journal_t *journal,
			
 
				 			     struct buffer_head *descriptor,
			
 
				-			     int offset, int write_op)
			
 
				+			     int offset)
			
 
				 {
			
 
				 	jbd2_journal_revoke_header_t *header;
			
 
				 
			
@@ -665,12 +643,12 @@ static void flush_descriptor(journal_t *journal,
 
				 
			
 
				 	header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
			
 
				 	header->r_count = cpu_to_be32(offset);
			
 
				-	jbd2_revoke_csum_set(journal, descriptor);
			
 
				+	jbd2_descriptor_block_csum_set(journal, descriptor);
			
 
				 
			
 
				 	set_buffer_jwrite(descriptor);
			
 
				 	BUFFER_TRACE(descriptor, "write");
			
 
				 	set_buffer_dirty(descriptor);
			
 
				-	write_dirty_buffer(descriptor, write_op);
			
 
				+	write_dirty_buffer(descriptor, WRITE_SYNC);
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -966,14 +966,8 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 
				 		if (!frozen_buffer) {
			
 
				 			JBUFFER_TRACE(jh, "allocate memory for buffer");
			
 
				 			jbd_unlock_bh_state(bh);
			
 
				-			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
			
 
				-			if (!frozen_buffer) {
			
 
				-				printk(KERN_ERR "%s: OOM for frozen_buffer\n",
			
 
				-				       __func__);
			
 
				-				JBUFFER_TRACE(jh, "oom!");
			
 
				-				error = -ENOMEM;
			
 
				-				goto out;
			
 
				-			}
			
 
				+			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
			
 
				+						   GFP_NOFS | __GFP_NOFAIL);
			
 
				 			goto repeat;
			
 
				 		}
			
 
				 		jh->b_frozen_data = frozen_buffer;
			
@@ -1226,15 +1220,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 
				 		goto out;
			
 
				 
			
 
				 repeat:
			
 
				-	if (!jh->b_committed_data) {
			
 
				-		committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
			
 
				-		if (!committed_data) {
			
 
				-			printk(KERN_ERR "%s: No memory for committed data\n",
			
 
				-				__func__);
			
 
				-			err = -ENOMEM;
			
 
				-			goto out;
			
 
				-		}
			
 
				-	}
			
 
				+	if (!jh->b_committed_data)
			
 
				+		committed_data = jbd2_alloc(jh2bh(jh)->b_size,
			
 
				+					    GFP_NOFS|__GFP_NOFAIL);
			
 
				 
			
 
				 	jbd_lock_bh_state(bh);
			
 
				 	if (!jh->b_committed_data) {
			
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -1,858 +1,433 @@
 
				-/*
			
 
				- * linux/fs/mbcache.c
			
 
				- * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
			
 
				- */
			
 
				-
			
 
				-/*
			
 
				- * Filesystem Meta Information Block Cache (mbcache)
			
 
				- *
			
 
				- * The mbcache caches blocks of block devices that need to be located
			
 
				- * by their device/block number, as well as by other criteria (such
			
 
				- * as the block's contents).
			
 
				- *
			
 
				- * There can only be one cache entry in a cache per device and block number.
			
 
				- * Additional indexes need not be unique in this sense. The number of
			
 
				- * additional indexes (=other criteria) can be hardwired at compile time
			
 
				- * or specified at cache create time.
			
 
				- *
			
 
				- * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
			
 
				- * in the cache. A valid entry is in the main hash tables of the cache,
			
 
				- * and may also be in the lru list. An invalid entry is not in any hashes
			
 
				- * or lists.
			
 
				- *
			
 
				- * A valid cache entry is only in the lru list if no handles refer to it.
			
 
				- * Invalid cache entries will be freed when the last handle to the cache
			
 
				- * entry is released. Entries that cannot be freed immediately are put
			
 
				- * back on the lru list.
			
 
				- */
			
 
				-
			
 
				-/*
			
 
				- * Lock descriptions and usage:
			
 
				- *
			
 
				- * Each hash chain of both the block and index hash tables now contains
			
 
				- * a built-in lock used to serialize accesses to the hash chain.
			
 
				- *
			
 
				- * Accesses to global data structures mb_cache_list and mb_cache_lru_list
			
 
				- * are serialized via the global spinlock mb_cache_spinlock.
			
 
				- *
			
 
				- * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
			
 
				- * accesses to its local data, such as e_used and e_queued.
			
 
				- *
			
 
				- * Lock ordering:
			
 
				- *
			
 
				- * Each block hash chain's lock has the highest lock order, followed by an
			
 
				- * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
			
 
				- * lock), and mb_cach_spinlock, with the lowest order.  While holding
			
 
				- * either a block or index hash chain lock, a thread can acquire an
			
 
				- * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
			
 
				- *
			
 
				- * Synchronization:
			
 
				- *
			
 
				- * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
			
 
				- * index hash chian, it needs to lock the corresponding hash chain.  For each
			
 
				- * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
			
 
				- * prevent either any simultaneous release or free on the entry and also
			
 
				- * to serialize accesses to either the e_used or e_queued member of the entry.
			
 
				- *
			
 
				- * To avoid having a dangling reference to an already freed
			
 
				- * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
			
 
				- * block hash chain and also no longer being referenced, both e_used,
			
 
				- * and e_queued are 0's.  When an mb_cache_entry is explicitly freed it is
			
 
				- * first removed from a block hash chain.
			
 
				- */
			
 
				-
			
 
				-#include <linux/kernel.h>
			
 
				-#include <linux/module.h>
			
 
				-
			
 
				-#include <linux/hash.h>
			
 
				-#include <linux/fs.h>
			
 
				-#include <linux/mm.h>
			
 
				+#include <linux/spinlock.h>
			
 
				 #include <linux/slab.h>
			
 
				-#include <linux/sched.h>
			
 
				+#include <linux/list.h>
			
 
				 #include <linux/list_bl.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/workqueue.h>
			
 
				 #include <linux/mbcache.h>
			
 
				-#include <linux/init.h>
			
 
				-#include <linux/blockgroup_lock.h>
			
 
				-#include <linux/log2.h>
			
 
				-
			
 
				-#ifdef MB_CACHE_DEBUG
			
 
				-# define mb_debug(f...) do { \
			
 
				-		printk(KERN_DEBUG f); \
			
 
				-		printk("\n"); \
			
 
				-	} while (0)
			
 
				-#define mb_assert(c) do { if (!(c)) \
			
 
				-		printk(KERN_ERR "assertion " #c " failed\n"); \
			
 
				-	} while(0)
			
 
				-#else
			
 
				-# define mb_debug(f...) do { } while(0)
			
 
				-# define mb_assert(c) do { } while(0)
			
 
				-#endif
			
 
				-#define mb_error(f...) do { \
			
 
				-		printk(KERN_ERR f); \
			
 
				-		printk("\n"); \
			
 
				-	} while(0)
			
 
				-
			
 
				-#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
			
 
				-
			
 
				-#define MB_CACHE_ENTRY_LOCK_BITS	ilog2(NR_BG_LOCKS)
			
 
				-#define	MB_CACHE_ENTRY_LOCK_INDEX(ce)			\
			
 
				-	(hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
			
 
				-
			
 
				-static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
			
 
				-static struct blockgroup_lock *mb_cache_bg_lock;
			
 
				-static struct kmem_cache *mb_cache_kmem_cache;
			
 
				-
			
 
				-MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
			
 
				-MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
			
 
				-MODULE_LICENSE("GPL");
			
 
				-
			
 
				-EXPORT_SYMBOL(mb_cache_create);
			
 
				-EXPORT_SYMBOL(mb_cache_shrink);
			
 
				-EXPORT_SYMBOL(mb_cache_destroy);
			
 
				-EXPORT_SYMBOL(mb_cache_entry_alloc);
			
 
				-EXPORT_SYMBOL(mb_cache_entry_insert);
			
 
				-EXPORT_SYMBOL(mb_cache_entry_release);
			
 
				-EXPORT_SYMBOL(mb_cache_entry_free);
			
 
				-EXPORT_SYMBOL(mb_cache_entry_get);
			
 
				-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
			
 
				-EXPORT_SYMBOL(mb_cache_entry_find_first);
			
 
				-EXPORT_SYMBOL(mb_cache_entry_find_next);
			
 
				-#endif
			
 
				 
			
 
				 /*
			
 
				- * Global data: list of all mbcache's, lru list, and a spinlock for
			
 
				- * accessing cache data structures on SMP machines. The lru list is
			
 
				- * global across all mbcaches.
			
 
				+ * Mbcache is a simple key-value store. Keys need not be unique, however
			
 
				+ * key-value pairs are expected to be unique (we use this fact in
			
 
				+ * mb_cache_entry_delete_block()).
			
 
				+ *
			
 
				+ * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
			
 
				+ * They use hash of a block contents as a key and block number as a value.
			
 
				+ * That's why keys need not be unique (different xattr blocks may end up having
			
 
				+ * the same hash). However block number always uniquely identifies a cache
			
 
				+ * entry.
			
 
				+ *
			
 
				+ * We provide functions for creation and removal of entries, search by key,
			
 
				+ * and a special "delete entry with given key-value pair" operation. Fixed
			
 
				+ * size hash table is used for fast key lookups.
			
 
				  */
			
 
				 
			
 
				-static LIST_HEAD(mb_cache_list);
			
 
				-static LIST_HEAD(mb_cache_lru_list);
			
 
				-static DEFINE_SPINLOCK(mb_cache_spinlock);
			
 
				-
			
 
				-static inline void
			
 
				-__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
			
 
				-{
			
 
				-	spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
			
 
				-		MB_CACHE_ENTRY_LOCK_INDEX(ce)));
			
 
				-}
			
 
				-
			
 
				-static inline void
			
 
				-__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
			
 
				-{
			
 
				-	spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
			
 
				-		MB_CACHE_ENTRY_LOCK_INDEX(ce)));
			
 
				-}
			
 
				-
			
 
				-static inline int
			
 
				-__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
			
 
				-{
			
 
				-	return !hlist_bl_unhashed(&ce->e_block_list);
			
 
				-}
			
 
				+struct mb_cache {
			
 
				+	/* Hash table of entries */
			
 
				+	struct hlist_bl_head	*c_hash;
			
 
				+	/* log2 of hash table size */
			
 
				+	int			c_bucket_bits;
			
 
				+	/* Maximum entries in cache to avoid degrading hash too much */
			
 
				+	int			c_max_entries;
			
 
				+	/* Protects c_list, c_entry_count */
			
 
				+	spinlock_t		c_list_lock;
			
 
				+	struct list_head	c_list;
			
 
				+	/* Number of entries in cache */
			
 
				+	unsigned long		c_entry_count;
			
 
				+	struct shrinker		c_shrink;
			
 
				+	/* Work for shrinking when the cache has too many entries */
			
 
				+	struct work_struct	c_shrink_work;
			
 
				+};
			
 
				 
			
 
				+static struct kmem_cache *mb_entry_cache;
			
 
				 
			
 
				-static inline void
			
 
				-__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
			
 
				-{
			
 
				-	if (__mb_cache_entry_is_block_hashed(ce))
			
 
				-		hlist_bl_del_init(&ce->e_block_list);
			
 
				-}
			
 
				+static unsigned long mb_cache_shrink(struct mb_cache *cache,
			
 
				+				     unsigned int nr_to_scan);
			
 
				 
			
 
				-static inline int
			
 
				-__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
			
 
				+static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
			
 
				+							u32 key)
			
 
				 {
			
 
				-	return !hlist_bl_unhashed(&ce->e_index.o_list);
			
 
				+	return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
			
 
				 }
			
 
				 
			
 
				-static inline void
			
 
				-__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
			
 
				-{
			
 
				-	if (__mb_cache_entry_is_index_hashed(ce))
			
 
				-		hlist_bl_del_init(&ce->e_index.o_list);
			
 
				-}
			
 
				+/*
			
 
				+ * Number of entries to reclaim synchronously when there are too many entries
			
 
				+ * in cache
			
 
				+ */
			
 
				+#define SYNC_SHRINK_BATCH 64
			
 
				 
			
 
				 /*
			
 
				- * __mb_cache_entry_unhash_unlock()
			
 
				- *
			
 
				- * This function is called to unhash both the block and index hash
			
 
				- * chain.
			
 
				- * It assumes both the block and index hash chain is locked upon entry.
			
 
				- * It also unlock both hash chains both exit
			
 
				+ * mb_cache_entry_create - create entry in cache
			
 
				+ * @cache - cache where the entry should be created
			
 
				+ * @mask - gfp mask with which the entry should be allocated
			
 
				+ * @key - key of the entry
			
 
				+ * @block - block that contains data
			
 
				+ * @reusable - is the block reusable by other inodes?
			
 
				+ *
			
 
				+ * Creates entry in @cache with key @key and records that data is stored in
			
 
				+ * block @block. The function returns -EBUSY if entry with the same key
			
 
				+ * and for the same block already exists in cache, or -ENOMEM if the entry cannot be allocated. Otherwise 0 is returned.
			
 
				  */
			
 
				-static inline void
			
 
				-__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
			
 
				+int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
			
 
				+			  sector_t block, bool reusable)
			
 
				 {
			
 
				-	__mb_cache_entry_unhash_index(ce);
			
 
				-	hlist_bl_unlock(ce->e_index_hash_p);
			
 
				-	__mb_cache_entry_unhash_block(ce);
			
 
				-	hlist_bl_unlock(ce->e_block_hash_p);
			
 
				+	struct mb_cache_entry *entry, *dup;
			
 
				+	struct hlist_bl_node *dup_node;
			
 
				+	struct hlist_bl_head *head;
			
 
				+
			
 
				+	/* Schedule background reclaim if there are too many entries */
			
 
				+	if (cache->c_entry_count >= cache->c_max_entries)
			
 
				+		schedule_work(&cache->c_shrink_work);
			
 
				+	/* Do some sync reclaim if background reclaim cannot keep up */
			
 
				+	if (cache->c_entry_count >= 2*cache->c_max_entries)
			
 
				+		mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
			
 
				+
			
 
				+	entry = kmem_cache_alloc(mb_entry_cache, mask);
			
 
				+	if (!entry)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	INIT_LIST_HEAD(&entry->e_list);
			
 
				+	/* Initial reference, held on behalf of the hash list */
			
 
				+	atomic_set(&entry->e_refcnt, 1);
			
 
				+	entry->e_key = key;
			
 
				+	entry->e_block = block;
			
 
				+	entry->e_reusable = reusable;
			
 
				+	head = mb_cache_entry_head(cache, key);
			
 
				+	hlist_bl_lock(head);
			
 
				+	hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
			
 
				+		if (dup->e_key == key && dup->e_block == block) {
			
 
				+			hlist_bl_unlock(head);
			
 
				+			kmem_cache_free(mb_entry_cache, entry);
			
 
				+			return -EBUSY;
			
 
				+		}
			
 
				+	}
			
 
				+	hlist_bl_add_head(&entry->e_hash_list, head);
			
 
				+	hlist_bl_unlock(head);
			
 
				+
			
 
				+	spin_lock(&cache->c_list_lock);
			
 
				+	list_add_tail(&entry->e_list, &cache->c_list);
			
 
				+	/* Grab ref for LRU list */
			
 
				+	atomic_inc(&entry->e_refcnt);
			
 
				+	cache->c_entry_count++;
			
 
				+	spin_unlock(&cache->c_list_lock);
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				+EXPORT_SYMBOL(mb_cache_entry_create);
			
 
				 
			
 
				-static void
			
 
				-__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
			
 
				+void __mb_cache_entry_free(struct mb_cache_entry *entry)
			
 
				 {
			
 
				-	struct mb_cache *cache = ce->e_cache;
			
 
				-
			
 
				-	mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
			
 
				-	kmem_cache_free(cache->c_entry_cache, ce);
			
 
				-	atomic_dec(&cache->c_entry_count);
			
 
				+	kmem_cache_free(mb_entry_cache, entry);
			
 
				 }
			
 
				+EXPORT_SYMBOL(__mb_cache_entry_free);
			
 
				 
			
 
				-static void
			
 
				-__mb_cache_entry_release(struct mb_cache_entry *ce)
			
 
				+static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
			
 
				+					   struct mb_cache_entry *entry,
			
 
				+					   u32 key)
			
 
				 {
			
 
				-	/* First lock the entry to serialize access to its local data. */
			
 
				-	__spin_lock_mb_cache_entry(ce);
			
 
				-	/* Wake up all processes queuing for this cache entry. */
			
 
				-	if (ce->e_queued)
			
 
				-		wake_up_all(&mb_cache_queue);
			
 
				-	if (ce->e_used >= MB_CACHE_WRITER)
			
 
				-		ce->e_used -= MB_CACHE_WRITER;
			
 
				-	/*
			
 
				-	 * Make sure that all cache entries on lru_list have
			
 
				-	 * both e_used and e_qued of 0s.
			
 
				-	 */
			
 
				-	ce->e_used--;
			
 
				-	if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
			
 
				-		if (!__mb_cache_entry_is_block_hashed(ce)) {
			
 
				-			__spin_unlock_mb_cache_entry(ce);
			
 
				-			goto forget;
			
 
				+	struct mb_cache_entry *old_entry = entry;
			
 
				+	struct hlist_bl_node *node;
			
 
				+	struct hlist_bl_head *head;
			
 
				+
			
 
				+	head = mb_cache_entry_head(cache, key);
			
 
				+	hlist_bl_lock(head);
			
 
				+	if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
			
 
				+		node = entry->e_hash_list.next;
			
 
				+	else
			
 
				+		node = hlist_bl_first(head);
			
 
				+	while (node) {
			
 
				+		entry = hlist_bl_entry(node, struct mb_cache_entry,
			
 
				+				       e_hash_list);
			
 
				+		if (entry->e_key == key && entry->e_reusable) {
			
 
				+			atomic_inc(&entry->e_refcnt);
			
 
				+			goto out;
			
 
				 		}
			
 
				-		/*
			
 
				-		 * Need access to lru list, first drop entry lock,
			
 
				-		 * then reacquire the lock in the proper order.
			
 
				-		 */
			
 
				-		spin_lock(&mb_cache_spinlock);
			
 
				-		if (list_empty(&ce->e_lru_list))
			
 
				-			list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
			
 
				-		spin_unlock(&mb_cache_spinlock);
			
 
				+		node = node->next;
			
 
				 	}
			
 
				-	__spin_unlock_mb_cache_entry(ce);
			
 
				-	return;
			
 
				-forget:
			
 
				-	mb_assert(list_empty(&ce->e_lru_list));
			
 
				-	__mb_cache_entry_forget(ce, GFP_KERNEL);
			
 
				+	entry = NULL;
			
 
				+out:
			
 
				+	hlist_bl_unlock(head);
			
 
				+	if (old_entry)
			
 
				+		mb_cache_entry_put(cache, old_entry);
			
 
				+
			
 
				+	return entry;
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * mb_cache_shrink_scan()  memory pressure callback
			
 
				- *
			
 
				- * This function is called by the kernel memory management when memory
			
 
				- * gets low.
			
 
				+ * mb_cache_entry_find_first - find the first entry in cache with given key
			
 
				+ * @cache: cache where we should search
			
 
				+ * @key: key to look for
			
 
				  *
			
 
				- * @shrink: (ignored)
			
 
				- * @sc: shrink_control passed from reclaim
			
 
				- *
			
 
				- * Returns the number of objects freed.
			
 
				+ * Search in @cache for entry with key @key. Grabs reference to the first
			
 
				+ * entry found and returns the entry.
			
 
				  */
			
 
				-static unsigned long
			
 
				-mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
			
 
				+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
			
 
				+						 u32 key)
			
 
				 {
			
 
				-	LIST_HEAD(free_list);
			
 
				-	struct mb_cache_entry *entry, *tmp;
			
 
				-	int nr_to_scan = sc->nr_to_scan;
			
 
				-	gfp_t gfp_mask = sc->gfp_mask;
			
 
				-	unsigned long freed = 0;
			
 
				-
			
 
				-	mb_debug("trying to free %d entries", nr_to_scan);
			
 
				-	spin_lock(&mb_cache_spinlock);
			
 
				-	while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
			
 
				-		struct mb_cache_entry *ce =
			
 
				-			list_entry(mb_cache_lru_list.next,
			
 
				-				struct mb_cache_entry, e_lru_list);
			
 
				-		list_del_init(&ce->e_lru_list);
			
 
				-		if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
			
 
				-			continue;
			
 
				-		spin_unlock(&mb_cache_spinlock);
			
 
				-		/* Prevent any find or get operation on the entry */
			
 
				-		hlist_bl_lock(ce->e_block_hash_p);
			
 
				-		hlist_bl_lock(ce->e_index_hash_p);
			
 
				-		/* Ignore if it is touched by a find/get */
			
 
				-		if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
			
 
				-			!list_empty(&ce->e_lru_list)) {
			
 
				-			hlist_bl_unlock(ce->e_index_hash_p);
			
 
				-			hlist_bl_unlock(ce->e_block_hash_p);
			
 
				-			spin_lock(&mb_cache_spinlock);
			
 
				-			continue;
			
 
				-		}
			
 
				-		__mb_cache_entry_unhash_unlock(ce);
			
 
				-		list_add_tail(&ce->e_lru_list, &free_list);
			
 
				-		spin_lock(&mb_cache_spinlock);
			
 
				-	}
			
 
				-	spin_unlock(&mb_cache_spinlock);
			
 
				-
			
 
				-	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
			
 
				-		__mb_cache_entry_forget(entry, gfp_mask);
			
 
				-		freed++;
			
 
				-	}
			
 
				-	return freed;
			
 
				+	return __entry_find(cache, NULL, key);
			
 
				 }
			
 
				+EXPORT_SYMBOL(mb_cache_entry_find_first);
			
 
				 
			
 
				-static unsigned long
			
 
				-mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
			
 
				+/*
			
 
				+ * mb_cache_entry_find_next - find next entry in cache with the same key
			
 
				+ * @cache: cache where we should search
			
 
				+ * @entry: entry to start search from
			
 
				+ *
			
 
				+ * Finds next entry in the hash chain which has the same key as @entry.
			
 
				+ * If @entry is unhashed (which can happen when deletion of entry races
			
 
				+ * with the search), finds the first entry in the hash chain. The function
			
 
				+ * drops reference to @entry and returns with a reference to the found entry.
			
 
				+ */
			
 
				+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
			
 
				+						struct mb_cache_entry *entry)
			
 
				 {
			
 
				-	struct mb_cache *cache;
			
 
				-	unsigned long count = 0;
			
 
				-
			
 
				-	spin_lock(&mb_cache_spinlock);
			
 
				-	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
			
 
				-		mb_debug("cache %s (%d)", cache->c_name,
			
 
				-			  atomic_read(&cache->c_entry_count));
			
 
				-		count += atomic_read(&cache->c_entry_count);
			
 
				-	}
			
 
				-	spin_unlock(&mb_cache_spinlock);
			
 
				-
			
 
				-	return vfs_pressure_ratio(count);
			
 
				+	return __entry_find(cache, entry, entry->e_key);
			
 
				 }
			
 
				-
			
 
				-static struct shrinker mb_cache_shrinker = {
			
 
				-	.count_objects = mb_cache_shrink_count,
			
 
				-	.scan_objects = mb_cache_shrink_scan,
			
 
				-	.seeks = DEFAULT_SEEKS,
			
 
				-};
			
 
				+EXPORT_SYMBOL(mb_cache_entry_find_next);
			
 
				 
			
 
				 /*
			
 
				- * mb_cache_create()  create a new cache
			
 
				- *
			
 
				- * All entries in one cache are equal size. Cache entries may be from
			
 
				- * multiple devices. If this is the first mbcache created, registers
			
 
				- * the cache with kernel memory management. Returns NULL if no more
			
 
				- * memory was available.
			
 
				- *
			
 
				- * @name: name of the cache (informal)
			
 
				- * @bucket_bits: log2(number of hash buckets)
			
 
				+ * mb_cache_entry_get - get a cache entry by block number (and key)
			
 
				+ * @cache - cache we work with
			
 
				+ * @key - key of block number @block
			
 
				+ * @block - block number
			
 
				  */
			
 
				-struct mb_cache *
			
 
				-mb_cache_create(const char *name, int bucket_bits)
			
 
				+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
			
 
				+					  sector_t block)
			
 
				 {
			
 
				-	int n, bucket_count = 1 << bucket_bits;
			
 
				-	struct mb_cache *cache = NULL;
			
 
				-
			
 
				-	if (!mb_cache_bg_lock) {
			
 
				-		mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
			
 
				-			GFP_KERNEL);
			
 
				-		if (!mb_cache_bg_lock)
			
 
				-			return NULL;
			
 
				-		bgl_lock_init(mb_cache_bg_lock);
			
 
				-	}
			
 
				-
			
 
				-	cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
			
 
				-	if (!cache)
			
 
				-		return NULL;
			
 
				-	cache->c_name = name;
			
 
				-	atomic_set(&cache->c_entry_count, 0);
			
 
				-	cache->c_bucket_bits = bucket_bits;
			
 
				-	cache->c_block_hash = kmalloc(bucket_count *
			
 
				-		sizeof(struct hlist_bl_head), GFP_KERNEL);
			
 
				-	if (!cache->c_block_hash)
			
 
				-		goto fail;
			
 
				-	for (n=0; n<bucket_count; n++)
			
 
				-		INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
			
 
				-	cache->c_index_hash = kmalloc(bucket_count *
			
 
				-		sizeof(struct hlist_bl_head), GFP_KERNEL);
			
 
				-	if (!cache->c_index_hash)
			
 
				-		goto fail;
			
 
				-	for (n=0; n<bucket_count; n++)
			
 
				-		INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
			
 
				-	if (!mb_cache_kmem_cache) {
			
 
				-		mb_cache_kmem_cache = kmem_cache_create(name,
			
 
				-			sizeof(struct mb_cache_entry), 0,
			
 
				-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
			
 
				-		if (!mb_cache_kmem_cache)
			
 
				-			goto fail2;
			
 
				+	struct hlist_bl_node *node;
			
 
				+	struct hlist_bl_head *head;
			
 
				+	struct mb_cache_entry *entry;
			
 
				+
			
 
				+	head = mb_cache_entry_head(cache, key);
			
 
				+	hlist_bl_lock(head);
			
 
				+	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
			
 
				+		if (entry->e_key == key && entry->e_block == block) {
			
 
				+			atomic_inc(&entry->e_refcnt);
			
 
				+			goto out;
			
 
				+		}
			
 
				 	}
			
 
				-	cache->c_entry_cache = mb_cache_kmem_cache;
			
 
				-
			
 
				-	/*
			
 
				-	 * Set an upper limit on the number of cache entries so that the hash
			
 
				-	 * chains won't grow too long.
			
 
				-	 */
			
 
				-	cache->c_max_entries = bucket_count << 4;
			
 
				-
			
 
				-	spin_lock(&mb_cache_spinlock);
			
 
				-	list_add(&cache->c_cache_list, &mb_cache_list);
			
 
				-	spin_unlock(&mb_cache_spinlock);
			
 
				-	return cache;
			
 
				-
			
 
				-fail2:
			
 
				-	kfree(cache->c_index_hash);
			
 
				-
			
 
				-fail:
			
 
				-	kfree(cache->c_block_hash);
			
 
				-	kfree(cache);
			
 
				-	return NULL;
			
 
				+	entry = NULL;
			
 
				+out:
			
 
				+	hlist_bl_unlock(head);
			
 
				+	return entry;
			
 
				 }
			
 
				+EXPORT_SYMBOL(mb_cache_entry_get);
			
 
				 
			
 
				-
			
 
				-/*
			
 
				- * mb_cache_shrink()
			
 
				- *
			
 
				- * Removes all cache entries of a device from the cache. All cache entries
			
 
				- * currently in use cannot be freed, and thus remain in the cache. All others
			
 
				- * are freed.
			
 
				+/* mb_cache_entry_delete_block - remove information about block from cache
			
 
				+ * @cache - cache we work with
			
 
				+ * @key - key of block @block
			
 
				+ * @block - block number
			
 
				  *
			
 
				- * @bdev: which device's cache entries to shrink
			
 
				+ * Remove entry from cache @cache with key @key with data stored in @block.
			
 
				  */
			
 
				-void
			
 
				-mb_cache_shrink(struct block_device *bdev)
			
 
				+void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
			
 
				+				 sector_t block)
			
 
				 {
			
 
				-	LIST_HEAD(free_list);
			
 
				-	struct list_head *l;
			
 
				-	struct mb_cache_entry *ce, *tmp;
			
 
				-
			
 
				-	l = &mb_cache_lru_list;
			
 
				-	spin_lock(&mb_cache_spinlock);
			
 
				-	while (!list_is_last(l, &mb_cache_lru_list)) {
			
 
				-		l = l->next;
			
 
				-		ce = list_entry(l, struct mb_cache_entry, e_lru_list);
			
 
				-		if (ce->e_bdev == bdev) {
			
 
				-			list_del_init(&ce->e_lru_list);
			
 
				-			if (ce->e_used || ce->e_queued ||
			
 
				-				atomic_read(&ce->e_refcnt))
			
 
				-				continue;
			
 
				-			spin_unlock(&mb_cache_spinlock);
			
 
				-			/*
			
 
				-			 * Prevent any find or get operation on the entry.
			
 
				-			 */
			
 
				-			hlist_bl_lock(ce->e_block_hash_p);
			
 
				-			hlist_bl_lock(ce->e_index_hash_p);
			
 
				-			/* Ignore if it is touched by a find/get */
			
 
				-			if (ce->e_used || ce->e_queued ||
			
 
				-				atomic_read(&ce->e_refcnt) ||
			
 
				-				!list_empty(&ce->e_lru_list)) {
			
 
				-				hlist_bl_unlock(ce->e_index_hash_p);
			
 
				-				hlist_bl_unlock(ce->e_block_hash_p);
			
 
				-				l = &mb_cache_lru_list;
			
 
				-				spin_lock(&mb_cache_spinlock);
			
 
				-				continue;
			
 
				+	struct hlist_bl_node *node;
			
 
				+	struct hlist_bl_head *head;
			
 
				+	struct mb_cache_entry *entry;
			
 
				+
			
 
				+	head = mb_cache_entry_head(cache, key);
			
 
				+	hlist_bl_lock(head);
			
 
				+	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
			
 
				+		if (entry->e_key == key && entry->e_block == block) {
			
 
				+			/* We keep hash list reference to keep entry alive */
			
 
				+			hlist_bl_del_init(&entry->e_hash_list);
			
 
				+			hlist_bl_unlock(head);
			
 
				+			spin_lock(&cache->c_list_lock);
			
 
				+			if (!list_empty(&entry->e_list)) {
			
 
				+				list_del_init(&entry->e_list);
			
 
				+				cache->c_entry_count--;
			
 
				+				atomic_dec(&entry->e_refcnt);
			
 
				 			}
			
 
				-			__mb_cache_entry_unhash_unlock(ce);
			
 
				-			mb_assert(!(ce->e_used || ce->e_queued ||
			
 
				-				atomic_read(&ce->e_refcnt)));
			
 
				-			list_add_tail(&ce->e_lru_list, &free_list);
			
 
				-			l = &mb_cache_lru_list;
			
 
				-			spin_lock(&mb_cache_spinlock);
			
 
				+			spin_unlock(&cache->c_list_lock);
			
 
				+			mb_cache_entry_put(cache, entry);
			
 
				+			return;
			
 
				 		}
			
 
				 	}
			
 
				-	spin_unlock(&mb_cache_spinlock);
			
 
				-
			
 
				-	list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
			
 
				-		__mb_cache_entry_forget(ce, GFP_KERNEL);
			
 
				-	}
			
 
				+	hlist_bl_unlock(head);
			
 
				 }
			
 
				+EXPORT_SYMBOL(mb_cache_entry_delete_block);
			
 
				 
			
 
				-
			
 
				-/*
			
 
				- * mb_cache_destroy()
			
 
				+/* mb_cache_entry_touch - cache entry got used
			
 
				+ * @cache - cache the entry belongs to
			
 
				+ * @entry - entry that got used
			
 
				  *
			
 
				- * Shrinks the cache to its minimum possible size (hopefully 0 entries),
			
 
				- * and then destroys it. If this was the last mbcache, un-registers the
			
 
				- * mbcache from kernel memory management.
			
 
				+ * Marks entry as used to give it higher chances of surviving in cache.
			
 
				  */
			
 
				-void
			
 
				-mb_cache_destroy(struct mb_cache *cache)
			
 
				+void mb_cache_entry_touch(struct mb_cache *cache,
			
 
				+			  struct mb_cache_entry *entry)
			
 
				 {
			
 
				-	LIST_HEAD(free_list);
			
 
				-	struct mb_cache_entry *ce, *tmp;
			
 
				-
			
 
				-	spin_lock(&mb_cache_spinlock);
			
 
				-	list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
			
 
				-		if (ce->e_cache == cache)
			
 
				-			list_move_tail(&ce->e_lru_list, &free_list);
			
 
				-	}
			
 
				-	list_del(&cache->c_cache_list);
			
 
				-	spin_unlock(&mb_cache_spinlock);
			
 
				-
			
 
				-	list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
			
 
				-		list_del_init(&ce->e_lru_list);
			
 
				-		/*
			
 
				-		 * Prevent any find or get operation on the entry.
			
 
				-		 */
			
 
				-		hlist_bl_lock(ce->e_block_hash_p);
			
 
				-		hlist_bl_lock(ce->e_index_hash_p);
			
 
				-		mb_assert(!(ce->e_used || ce->e_queued ||
			
 
				-			atomic_read(&ce->e_refcnt)));
			
 
				-		__mb_cache_entry_unhash_unlock(ce);
			
 
				-		__mb_cache_entry_forget(ce, GFP_KERNEL);
			
 
				-	}
			
 
				-
			
 
				-	if (atomic_read(&cache->c_entry_count) > 0) {
			
 
				-		mb_error("cache %s: %d orphaned entries",
			
 
				-			  cache->c_name,
			
 
				-			  atomic_read(&cache->c_entry_count));
			
 
				-	}
			
 
				-
			
 
				-	if (list_empty(&mb_cache_list)) {
			
 
				-		kmem_cache_destroy(mb_cache_kmem_cache);
			
 
				-		mb_cache_kmem_cache = NULL;
			
 
				-	}
			
 
				-	kfree(cache->c_index_hash);
			
 
				-	kfree(cache->c_block_hash);
			
 
				-	kfree(cache);
			
 
				+	entry->e_referenced = 1;
			
 
				 }
			
 
				+EXPORT_SYMBOL(mb_cache_entry_touch);
			
 
				 
			
 
				-/*
			
 
				- * mb_cache_entry_alloc()
			
 
				- *
			
 
				- * Allocates a new cache entry. The new entry will not be valid initially,
			
 
				- * and thus cannot be looked up yet. It should be filled with data, and
			
 
				- * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
			
 
				- * if no more memory was available.
			
 
				- */
			
 
				-struct mb_cache_entry *
			
 
				-mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
			
 
				+/*
				+ * Shrinker count callback: report the number of entries currently
				+ * accounted to the cache that embeds @shrink (cache->c_entry_count).
				+ * @sc is not used.
				+ */
				+static unsigned long mb_cache_count(struct shrinker *shrink,
				+				    struct shrink_control *sc)
				 {
				-	struct mb_cache_entry *ce;
				-
				-	if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
				-		struct list_head *l;
				-
				-		l = &mb_cache_lru_list;
				-		spin_lock(&mb_cache_spinlock);
				-		while (!list_is_last(l, &mb_cache_lru_list)) {
				-			l = l->next;
				-			ce = list_entry(l, struct mb_cache_entry, e_lru_list);
				-			if (ce->e_cache == cache) {
				-				list_del_init(&ce->e_lru_list);
				-				if (ce->e_used || ce->e_queued ||
				-					atomic_read(&ce->e_refcnt))
				-					continue;
				-				spin_unlock(&mb_cache_spinlock);
				-				/*
				-				 * Prevent any find or get operation on the
				-				 * entry.
				-				 */
				-				hlist_bl_lock(ce->e_block_hash_p);
				-				hlist_bl_lock(ce->e_index_hash_p);
				-				/* Ignore if it is touched by a find/get */
				-				if (ce->e_used || ce->e_queued ||
				-					atomic_read(&ce->e_refcnt) ||
				-					!list_empty(&ce->e_lru_list)) {
				-					hlist_bl_unlock(ce->e_index_hash_p);
				-					hlist_bl_unlock(ce->e_block_hash_p);
				-					l = &mb_cache_lru_list;
				-					spin_lock(&mb_cache_spinlock);
				-					continue;
				-				}
				-				mb_assert(list_empty(&ce->e_lru_list));
				-				mb_assert(!(ce->e_used || ce->e_queued ||
				-					atomic_read(&ce->e_refcnt)));
				-				__mb_cache_entry_unhash_unlock(ce);
				-				goto found;
				-			}
				-		}
				-		spin_unlock(&mb_cache_spinlock);
				-	}
				+	/* Recover the owning cache from its embedded shrinker member. */
				+	struct mb_cache *cache = container_of(shrink, struct mb_cache,
				+					      c_shrink);
				 
				-	ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
				-	if (!ce)
				-		return NULL;
				-	atomic_inc(&cache->c_entry_count);
				-	INIT_LIST_HEAD(&ce->e_lru_list);
				-	INIT_HLIST_BL_NODE(&ce->e_block_list);
				-	INIT_HLIST_BL_NODE(&ce->e_index.o_list);
				-	ce->e_cache = cache;
				-	ce->e_queued = 0;
				-	atomic_set(&ce->e_refcnt, 0);
				-found:
				-	ce->e_block_hash_p = &cache->c_block_hash[0];
				-	ce->e_index_hash_p = &cache->c_index_hash[0];
				-	ce->e_used = 1 + MB_CACHE_WRITER;
				-	return ce;
				+	return cache->c_entry_count;
				 }
			
 
				 
			
 
				-
			
 
				-/*
			
 
				- * mb_cache_entry_insert()
			
 
				- *
			
 
				- * Inserts an entry that was allocated using mb_cache_entry_alloc() into
			
 
				- * the cache. After this, the cache entry can be looked up, but is not yet
			
 
				- * in the lru list as the caller still holds a handle to it. Returns 0 on
			
 
				- * success, or -EBUSY if a cache entry for that device + inode exists
			
 
				- * already (this may happen after a failed lookup, but when another process
			
 
				- * has inserted the same cache entry in the meantime).
			
 
				- *
			
 
				- * @bdev: device the cache entry belongs to
			
 
				- * @block: block number
			
 
				- * @key: lookup key
			
 
				- */
			
 
				-int
			
 
				-mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
			
 
				-		      sector_t block, unsigned int key)
			
 
				+/*
				+ * mb_cache_shrink - reclaim entries from the head of the cache list
				+ * @cache: cache to shrink
				+ * @nr_to_scan: upper bound on the number of list entries examined
				+ *
				+ * Walks cache->c_list from its head. An entry with e_referenced set has
				+ * the flag cleared and is rotated to the tail of the list; any other
				+ * entry is taken off the list and the hash and its reference is dropped.
				+ *
				+ * Returns the number of entries for which mb_cache_entry_put() dropped
				+ * the last reference.
				+ */
				+static unsigned long mb_cache_shrink(struct mb_cache *cache,
				+				     unsigned int nr_to_scan)
				 {
				-	struct mb_cache *cache = ce->e_cache;
				-	unsigned int bucket;
				-	struct hlist_bl_node *l;
				-	struct hlist_bl_head *block_hash_p;
				-	struct hlist_bl_head *index_hash_p;
				-	struct mb_cache_entry *lce;
				-
				-	mb_assert(ce);
				-	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 
				-			   cache->c_bucket_bits);
				-	block_hash_p = &cache->c_block_hash[bucket];
				-	hlist_bl_lock(block_hash_p);
				-	hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
				-		if (lce->e_bdev == bdev && lce->e_block == block) {
				-			hlist_bl_unlock(block_hash_p);
				-			return -EBUSY;
				+	struct mb_cache_entry *entry;
				+	struct hlist_bl_head *head;
				+	unsigned int shrunk = 0;
				+
				+	spin_lock(&cache->c_list_lock);
				+	while (nr_to_scan-- && !list_empty(&cache->c_list)) {
				+		entry = list_first_entry(&cache->c_list,
				+					 struct mb_cache_entry, e_list);
				+		if (entry->e_referenced) {
				+			entry->e_referenced = 0;
				+			/*
				+			 * list_move_tail() takes (entry, head). Rotate
				+			 * the entry to the tail so it is revisited only
				+			 * after the rest of the list; with the arguments
				+			 * swapped the list is left unchanged and the
				+			 * entry is reclaimed on the very next pass.
				+			 */
				+			list_move_tail(&entry->e_list, &cache->c_list);
				+			continue;
				 		}
				+		list_del_init(&entry->e_list);
				+		cache->c_entry_count--;
				+		/*
				+		 * We keep LRU list reference so that entry doesn't go away
				+		 * from under us.
				+		 */
				+		spin_unlock(&cache->c_list_lock);
				+		head = mb_cache_entry_head(cache, entry->e_key);
				+		hlist_bl_lock(head);
				+		/* Unhash under the chain lock, dropping one reference with it. */
				+		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
				+			hlist_bl_del_init(&entry->e_hash_list);
				+			atomic_dec(&entry->e_refcnt);
				+		}
				+		hlist_bl_unlock(head);
				+		/* Count the entry only if this put released the last reference. */
				+		if (mb_cache_entry_put(cache, entry))
				+			shrunk++;
				+		cond_resched();
				+		spin_lock(&cache->c_list_lock);
				 	}
				-	mb_assert(!__mb_cache_entry_is_block_hashed(ce));
				-	__mb_cache_entry_unhash_block(ce);
				-	__mb_cache_entry_unhash_index(ce);
				-	ce->e_bdev = bdev;
				-	ce->e_block = block;
				-	ce->e_block_hash_p = block_hash_p;
				-	ce->e_index.o_key = key;
				-	hlist_bl_add_head(&ce->e_block_list, block_hash_p);
				-	hlist_bl_unlock(block_hash_p);
				-	bucket = hash_long(key, cache->c_bucket_bits);
				-	index_hash_p = &cache->c_index_hash[bucket];
				-	hlist_bl_lock(index_hash_p);
				-	ce->e_index_hash_p = index_hash_p;
				-	hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
				-	hlist_bl_unlock(index_hash_p);
				-	return 0;
				-}
				+	spin_unlock(&cache->c_list_lock);
				 
				+	return shrunk;
				+}
			
 
				 
			
 
				-/*
			
 
				- * mb_cache_entry_release()
			
 
				- *
			
 
				- * Release a handle to a cache entry. When the last handle to a cache entry
			
 
				- * is released it is either freed (if it is invalid) or otherwise inserted
			
 
				- * in to the lru list.
			
 
				- */
			
 
				-void
			
 
				-mb_cache_entry_release(struct mb_cache_entry *ce)
			
 
				+/*
				+ * Shrinker scan callback: ask mb_cache_shrink() to examine up to
				+ * sc->nr_to_scan entries of the cache that embeds @shrink and return
				+ * the count it reports.
				+ */
				+static unsigned long mb_cache_scan(struct shrinker *shrink,
				+				   struct shrink_control *sc)
				 {
				-	__mb_cache_entry_release(ce);
				+	/* Same unsigned type as mb_cache_shrink()'s parameter; no signed detour. */
				+	unsigned int nr_to_scan = sc->nr_to_scan;
				+	struct mb_cache *cache = container_of(shrink, struct mb_cache,
				+					      c_shrink);
				+	return mb_cache_shrink(cache, nr_to_scan);
				 }
			
 
				 
			
 
				+/* We shrink 1/X of the cache when we have too many entries in it */
				+#define SHRINK_DIVISOR 16
				 
				-/*
				- * mb_cache_entry_free()
				- *
				- */
				-void
				-mb_cache_entry_free(struct mb_cache_entry *ce)
				+/*
				+ * Work item handler for cache->c_shrink_work: scan up to
				+ * c_max_entries / SHRINK_DIVISOR entries of the owning cache.
				+ * NOTE(review): where the work is queued is not visible in this chunk.
				+ */
				+static void mb_cache_shrink_worker(struct work_struct *work)
				 {
				-	mb_assert(ce);
				-	mb_assert(list_empty(&ce->e_lru_list));
				-	hlist_bl_lock(ce->e_index_hash_p);
				-	__mb_cache_entry_unhash_index(ce);
				-	hlist_bl_unlock(ce->e_index_hash_p);
				-	hlist_bl_lock(ce->e_block_hash_p);
				-	__mb_cache_entry_unhash_block(ce);
				-	hlist_bl_unlock(ce->e_block_hash_p);
				-	__mb_cache_entry_release(ce);
				+	/* Recover the owning cache from its embedded work_struct member. */
				+	struct mb_cache *cache = container_of(work, struct mb_cache,
				+					      c_shrink_work);
				+	mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR);
				 }
			
 
				 
			
 
				-
			
 
				 /*
				- * mb_cache_entry_get()
				+ * mb_cache_create - create cache
				+ * @bucket_bits: log2 of the hash table size
				  *
				- * Get a cache entry  by device / block number. (There can only be one entry
				- * in the cache per device and block.) Returns NULL if no such cache entry
				- * exists. The returned cache entry is locked for exclusive access ("single
				- * writer").
				+ * Create cache for keys with 2^bucket_bits hash entries.
				+ *
				+ * Takes a reference on this module for the lifetime of the cache.
				+ * Returns the new cache, or NULL if the module reference, a memory
				+ * allocation or the shrinker registration fails.
				  */
				-struct mb_cache_entry *
				-mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
				-		   sector_t block)
				+struct mb_cache *mb_cache_create(int bucket_bits)
				 {
				-	unsigned int bucket;
				-	struct hlist_bl_node *l;
				-	struct mb_cache_entry *ce;
				-	struct hlist_bl_head *block_hash_p;
				-
				-	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
				-			   cache->c_bucket_bits);
				-	block_hash_p = &cache->c_block_hash[bucket];
				-	/* First serialize access to the block corresponding hash chain. */
				-	hlist_bl_lock(block_hash_p);
				-	hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
				-		mb_assert(ce->e_block_hash_p == block_hash_p);
				-		if (ce->e_bdev == bdev && ce->e_block == block) {
				-			/*
				-			 * Prevent a free from removing the entry.
				-			 */
				-			atomic_inc(&ce->e_refcnt);
				-			hlist_bl_unlock(block_hash_p);
				-			__spin_lock_mb_cache_entry(ce);
				-			atomic_dec(&ce->e_refcnt);
				-			if (ce->e_used > 0) {
				-				DEFINE_WAIT(wait);
				-				while (ce->e_used > 0) {
				-					ce->e_queued++;
				-					prepare_to_wait(&mb_cache_queue, &wait,
				-							TASK_UNINTERRUPTIBLE);
				-					__spin_unlock_mb_cache_entry(ce);
				-					schedule();
				-					__spin_lock_mb_cache_entry(ce);
				-					ce->e_queued--;
				-				}
				-				finish_wait(&mb_cache_queue, &wait);
				-			}
				-			ce->e_used += 1 + MB_CACHE_WRITER;
				-			__spin_unlock_mb_cache_entry(ce);
				+	struct mb_cache *cache;
				+	int bucket_count = 1 << bucket_bits;
				+	int i;
				 
				-			if (!list_empty(&ce->e_lru_list)) {
				-				spin_lock(&mb_cache_spinlock);
				-				list_del_init(&ce->e_lru_list);
				-				spin_unlock(&mb_cache_spinlock);
				-			}
				-			if (!__mb_cache_entry_is_block_hashed(ce)) {
				-				__mb_cache_entry_release(ce);
				-				return NULL;
				-			}
				-			return ce;
				-		}
				+	if (!try_module_get(THIS_MODULE))
				+		return NULL;
				+
				+	cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
				+	if (!cache)
				+		goto err_out;
				+	cache->c_bucket_bits = bucket_bits;
				+	/* Entry limit is 16 entries per hash bucket. */
				+	cache->c_max_entries = bucket_count << 4;
				+	INIT_LIST_HEAD(&cache->c_list);
				+	spin_lock_init(&cache->c_list_lock);
				+	cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head),
				+				GFP_KERNEL);
				+	if (!cache->c_hash) {
				+		kfree(cache);
				+		goto err_out;
				 	}
				-	hlist_bl_unlock(block_hash_p);
				-	return NULL;
				-}
				+	for (i = 0; i < bucket_count; i++)
				+		INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
				 
				-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
				+	cache->c_shrink.count_objects = mb_cache_count;
				+	cache->c_shrink.scan_objects = mb_cache_scan;
				+	cache->c_shrink.seeks = DEFAULT_SEEKS;
				+	/*
				+	 * register_shrinker() can fail. Unwind instead of handing back a
				+	 * cache whose shrinker was never registered, which
				+	 * mb_cache_destroy() would later try to unregister.
				+	 */
				+	if (register_shrinker(&cache->c_shrink)) {
				+		kfree(cache->c_hash);
				+		kfree(cache);
				+		goto err_out;
				+	}
				 
				-static struct mb_cache_entry *
				-__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
				-		      struct block_device *bdev, unsigned int key)
				-{
				+	INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
				 
				-	/* The index hash chain is alredy acquire by caller. */
				-	while (l != NULL) {
				-		struct mb_cache_entry *ce =
				-			hlist_bl_entry(l, struct mb_cache_entry,
				-				e_index.o_list);
				-		mb_assert(ce->e_index_hash_p == head);
				-		if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
				-			/*
				-			 * Prevent a free from removing the entry.
				-			 */
				-			atomic_inc(&ce->e_refcnt);
				-			hlist_bl_unlock(head);
				-			__spin_lock_mb_cache_entry(ce);
				-			atomic_dec(&ce->e_refcnt);
				-			ce->e_used++;
				-			/* Incrementing before holding the lock gives readers
				-			   priority over writers. */
				-			if (ce->e_used >= MB_CACHE_WRITER) {
				-				DEFINE_WAIT(wait);
				-
				-				while (ce->e_used >= MB_CACHE_WRITER) {
				-					ce->e_queued++;
				-					prepare_to_wait(&mb_cache_queue, &wait,
				-							TASK_UNINTERRUPTIBLE);
				-					__spin_unlock_mb_cache_entry(ce);
				-					schedule();
				-					__spin_lock_mb_cache_entry(ce);
				-					ce->e_queued--;
				-				}
				-				finish_wait(&mb_cache_queue, &wait);
				-			}
				-			__spin_unlock_mb_cache_entry(ce);
				-			if (!list_empty(&ce->e_lru_list)) {
				-				spin_lock(&mb_cache_spinlock);
				-				list_del_init(&ce->e_lru_list);
				-				spin_unlock(&mb_cache_spinlock);
				-			}
				-			if (!__mb_cache_entry_is_block_hashed(ce)) {
				-				__mb_cache_entry_release(ce);
				-				return ERR_PTR(-EAGAIN);
				-			}
				-			return ce;
				-		}
				-		l = l->next;
				-	}
				-	hlist_bl_unlock(head);
				+	return cache;
				+
				+err_out:
				+	/* Every failure path drops the module reference taken above. */
				+	module_put(THIS_MODULE);
				 	return NULL;
				 }
				-
				+EXPORT_SYMBOL(mb_cache_create);
			
 
				 
			
 
				 /*
				- * mb_cache_entry_find_first()
				- *
				- * Find the first cache entry on a given device with a certain key in
				- * an additional index. Additional matches can be found with
				- * mb_cache_entry_find_next(). Returns NULL if no match was found. The
				- * returned cache entry is locked for shared access ("multiple readers").
				+ * mb_cache_destroy - destroy cache
				+ * @cache: the cache to destroy
				  *
				- * @cache: the cache to search
				- * @bdev: the device the cache entry should belong to
				- * @key: the key in the index
				+ * Free all entries in cache and cache itself. Caller must make sure nobody
				+ * (except shrinker) can reach @cache when calling this.
				+ *
				+ * Also drops the module reference with module_put(THIS_MODULE).
				  */
				-struct mb_cache_entry *
				-mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
				-			  unsigned int key)
				+void mb_cache_destroy(struct mb_cache *cache)
				 {
				-	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
				-	struct hlist_bl_node *l;
				-	struct mb_cache_entry *ce = NULL;
				-	struct hlist_bl_head *index_hash_p;
				-
				-	index_hash_p = &cache->c_index_hash[bucket];
				-	hlist_bl_lock(index_hash_p);
				-	if (!hlist_bl_empty(index_hash_p)) {
				-		l = hlist_bl_first(index_hash_p);
				-		ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
				-	} else
				-		hlist_bl_unlock(index_hash_p);
				-	return ce;
				-}
				+	struct mb_cache_entry *entry, *next;
				 
				+	/* Unregister the shrinker before the entry list is torn down. */
				+	unregister_shrinker(&cache->c_shrink);
				 
				-/*
				- * mb_cache_entry_find_next()
				- *
				- * Find the next cache entry on a given device with a certain key in an
				- * additional index. Returns NULL if no match could be found. The previous
				- * entry is atomatically released, so that mb_cache_entry_find_next() can
				- * be called like this:
				- *
				- * entry = mb_cache_entry_find_first();
				- * while (entry) {
				- * 	...
				- *	entry = mb_cache_entry_find_next(entry, ...);
				- * }
				- *
				- * @prev: The previous match
				- * @bdev: the device the cache entry should belong to
				- * @key: the key in the index
				- */
				-struct mb_cache_entry *
				-mb_cache_entry_find_next(struct mb_cache_entry *prev,
				-			 struct block_device *bdev, unsigned int key)
				-{
				-	struct mb_cache *cache = prev->e_cache;
				-	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
				-	struct hlist_bl_node *l;
				-	struct mb_cache_entry *ce;
				-	struct hlist_bl_head *index_hash_p;
				-
				-	index_hash_p = &cache->c_index_hash[bucket];
				-	mb_assert(prev->e_index_hash_p == index_hash_p);
				-	hlist_bl_lock(index_hash_p);
				-	mb_assert(!hlist_bl_empty(index_hash_p));
				-	l = prev->e_index.o_list.next;
				-	ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
				-	__mb_cache_entry_release(prev);
				-	return ce;
				+	/*
				+	 * We don't bother with any locking. Cache must not be used at this
				+	 * point.
				+	 */
				+	list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
				+		/* Unhash the entry and drop one reference along with it. */
				+		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
				+			hlist_bl_del_init(&entry->e_hash_list);
				+			atomic_dec(&entry->e_refcnt);
				+		} else
				+			/* An entry on the list but not hashed is unexpected. */
				+			WARN_ON(1);
				+		list_del(&entry->e_list);
				+		/* Exactly one reference should remain; the put below drops it. */
				+		WARN_ON(atomic_read(&entry->e_refcnt) != 1);
				+		mb_cache_entry_put(cache, entry);
				+	}
				+	kfree(cache->c_hash);
				+	kfree(cache);
				+	module_put(THIS_MODULE);
				 }
				+EXPORT_SYMBOL(mb_cache_destroy);
			
 
				 
			
 
				-#endif  /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
			
 
				-
			
 
				-static int __init init_mbcache(void)
			
 
				+/*
				+ * Module init: create the slab cache that backs struct mb_cache_entry.
				+ * Returns 0 on success or -ENOMEM if the slab cache cannot be created.
				+ */
				+static int __init mbcache_init(void)
				 {
				-	register_shrinker(&mb_cache_shrinker);
				+	mb_entry_cache = kmem_cache_create("mbcache",
				+				sizeof(struct mb_cache_entry), 0,
				+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
				+	/* Report the failure to the caller instead of crashing the kernel. */
				+	if (!mb_entry_cache)
				+		return -ENOMEM;
				 	return 0;
				 }
			
 
				 
			
 
				-static void __exit exit_mbcache(void)
			
 
				+/* Module exit: destroy the slab cache created by mbcache_init(). */
				+static void __exit mbcache_exit(void)
				 {
				-	unregister_shrinker(&mb_cache_shrinker);
				+	kmem_cache_destroy(mb_entry_cache);
				 }
			
 
				 
			
 
				-module_init(init_mbcache)
			
 
				-module_exit(exit_mbcache)
			
 
				+module_init(mbcache_init)
			
 
				+module_exit(mbcache_exit)
			
 
				 
			
 
				+MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
			
 
				+MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
			
 
				+MODULE_LICENSE("GPL");
			
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -200,7 +200,7 @@ typedef struct journal_block_tag_s
 
				 	__be32		t_blocknr_high; /* most-significant high 32bits. */
			
 
				 } journal_block_tag_t;
			
 
				 
			
 
				-/* Tail of descriptor block, for checksumming */
			
 
				+/* Tail of descriptor or revoke block, for checksumming */
			
 
				 struct jbd2_journal_block_tail {
			
 
				 	__be32		t_checksum;	/* crc32c(uuid+descr_block) */
			
 
				 };
			
@@ -215,11 +215,6 @@ typedef struct jbd2_journal_revoke_header_s
 
				 	__be32		 r_count;	/* Count of bytes used in the block */
			
 
				 } jbd2_journal_revoke_header_t;
			
 
				 
			
 
				-/* Tail of revoke block, for checksumming */
			
 
				-struct jbd2_journal_revoke_tail {
			
 
				-	__be32		r_checksum;	/* crc32c(uuid+revoke_block) */
			
 
				-};
			
 
				-
			
 
				 /* Definitions for the journal tag flags word: */
			
 
				 #define JBD2_FLAG_ESCAPE		1	/* on-disk block is escaped */
			
 
				 #define JBD2_FLAG_SAME_UUID	2	/* block has same uuid as previous */
			
@@ -1137,7 +1132,8 @@ static inline void jbd2_unfile_log_bh(struct buffer_head *bh)
 
				 }
			
 
				 
			
 
				 /* Log buffer allocation */
			
 
				-struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal);
			
 
				+struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int);
			
 
				+void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *);
			
 
				 int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
			
 
				 int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
			
 
				 			      unsigned long *block);
			
@@ -1327,10 +1323,8 @@ extern int	   jbd2_journal_init_revoke_caches(void);
 
				 extern void	   jbd2_journal_destroy_revoke(journal_t *);
			
 
				 extern int	   jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
			
 
				 extern int	   jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
			
 
				-extern void	   jbd2_journal_write_revoke_records(journal_t *journal,
			
 
				-						     transaction_t *transaction,
			
 
				-						     struct list_head *log_bufs,
			
 
				-						     int write_op);
			
 
				+extern void	   jbd2_journal_write_revoke_records(transaction_t *transaction,
			
 
				+						     struct list_head *log_bufs);
			
 
				 
			
 
				 /* Recovery revoke support */
			
 
				 extern int	jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t);
			
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -1,55 +1,52 @@
 
				-/*
			
 
				-  File: linux/mbcache.h
			
 
				+#ifndef _LINUX_MBCACHE_H
			
 
				+#define _LINUX_MBCACHE_H
			
 
				 
			
 
				-  (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
			
 
				-*/
			
 
				-struct mb_cache_entry {
			
 
				-	struct list_head		e_lru_list;
			
 
				-	struct mb_cache			*e_cache;
			
 
				-	unsigned short			e_used;
			
 
				-	unsigned short			e_queued;
			
 
				-	atomic_t			e_refcnt;
			
 
				-	struct block_device		*e_bdev;
			
 
				-	sector_t			e_block;
			
 
				-	struct hlist_bl_node		e_block_list;
			
 
				-	struct {
			
 
				-		struct hlist_bl_node	o_list;
			
 
				-		unsigned int		o_key;
			
 
				-	} e_index;
			
 
				-	struct hlist_bl_head		*e_block_hash_p;
			
 
				-	struct hlist_bl_head		*e_index_hash_p;
			
 
				-};
			
 
				+#include <linux/hash.h>
			
 
				+#include <linux/list_bl.h>
			
 
				+#include <linux/list.h>
			
 
				+#include <linux/atomic.h>
			
 
				+#include <linux/fs.h>
			
 
				 
			
 
				-struct mb_cache {
			
 
				-	struct list_head		c_cache_list;
			
 
				-	const char			*c_name;
			
 
				-	atomic_t			c_entry_count;
			
 
				-	int				c_max_entries;
			
 
				-	int				c_bucket_bits;
			
 
				-	struct kmem_cache		*c_entry_cache;
			
 
				-	struct hlist_bl_head		*c_block_hash;
			
 
				-	struct hlist_bl_head		*c_index_hash;
			
 
				-};
			
 
				+struct mb_cache;
			
 
				 
			
 
				-/* Functions on caches */
			
 
				+/* One cached block: linked on both the owning cache's list and its hash. */
				+struct mb_cache_entry {
				+	/* List of entries in cache - protected by cache->c_list_lock */
				+	struct list_head	e_list;
				+	/* Hash table list - protected by hash chain bitlock */
				+	struct hlist_bl_node	e_hash_list;
				+	/* Reference count; mb_cache_entry_put() frees the entry at zero */
				+	atomic_t		e_refcnt;
				+	/* Key in hash - stable during lifetime of the entry */
				+	u32			e_key;
				+	/*
				+	 * Checked by the shrinker in fs/mbcache.c: when set, the flag is
				+	 * cleared and the entry is not reclaimed on that visit.
				+	 */
				+	u32			e_referenced:1;
				+	/*
				+	 * NOTE(review): presumably set from the 'reusable' argument of
				+	 * mb_cache_entry_create() - confirm against the implementation.
				+	 */
				+	u32			e_reusable:1;
				+	/* Block number of hashed block - stable during lifetime of the entry */
				+	sector_t		e_block;
				+};
			
 
				 
			
 
				-struct mb_cache *mb_cache_create(const char *, int);
			
 
				-void mb_cache_shrink(struct block_device *);
			
 
				-void mb_cache_destroy(struct mb_cache *);
			
 
				+struct mb_cache *mb_cache_create(int bucket_bits);
			
 
				+void mb_cache_destroy(struct mb_cache *cache);
			
 
				 
			
 
				-/* Functions on cache entries */
			
 
				+int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
			
 
				+			  sector_t block, bool reusable);
			
 
				+void __mb_cache_entry_free(struct mb_cache_entry *entry);
			
 
				+/*
				+ * Drop one reference to @entry. If that was the last reference the
				+ * entry is freed via __mb_cache_entry_free() and 1 is returned;
				+ * otherwise 0 is returned. @cache is not used here.
				+ */
				+static inline int mb_cache_entry_put(struct mb_cache *cache,
				+				     struct mb_cache_entry *entry)
				+{
				+	if (atomic_dec_and_test(&entry->e_refcnt)) {
				+		__mb_cache_entry_free(entry);
				+		return 1;
				+	}
				+	return 0;
				+}
			
 
				 
			
 
				-struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *, gfp_t);
			
 
				-int mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *,
			
 
				-			  sector_t, unsigned int);
			
 
				-void mb_cache_entry_release(struct mb_cache_entry *);
			
 
				-void mb_cache_entry_free(struct mb_cache_entry *);
			
 
				-struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *,
			
 
				-					  struct block_device *,
			
 
				-					  sector_t);
			
 
				+void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
			
 
				+				  sector_t block);
			
 
				+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
			
 
				+					  sector_t block);
			
 
				 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
			
 
				-						 struct block_device *, 
			
 
				-						 unsigned int);
			
 
				-struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *,
			
 
				-						struct block_device *, 
			
 
				-						unsigned int);
			
 
				+						 u32 key);
			
 
				+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
			
 
				+						struct mb_cache_entry *entry);
			
 
				+void mb_cache_entry_touch(struct mb_cache *cache,
			
 
				+			  struct mb_cache_entry *entry);
			
 
				+
			
 
				+#endif	/* _LINUX_MBCACHE_H */