@@ -23,6 +23,7 @@
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -52,6 +53,21 @@ enum {
 	CHUNK_ALLOC_LIMITED = 2,
 };
 
+/*
+ * Control how reservations are dealt with.
+ *
+ * RESERVE_FREE - freeing a reservation.
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
+ *   ENOSPC accounting
+ * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
+ *   bytes_may_use as the ENOSPC accounting is done elsewhere
+ */
+enum {
+	RESERVE_FREE = 0,
+	RESERVE_ALLOC = 1,
+	RESERVE_ALLOC_NO_ACCOUNT = 2,
+};
+
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc);
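
An illustrative sketch (editorial, not part of the patch; cache and num_bytes are placeholders) of how these three modes pair with the btrfs_update_reserved_bytes() helper declared below:

	/* metadata allocator path: consume a bytes_may_use reservation */
	ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC);

	/* data path: ENOSPC accounting happens via the io_tree, so skip it */
	ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC_NO_ACCOUNT);

	/* drop a reservation for space that never made it to disk */
	btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE);
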
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups);
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				       u64 num_bytes, int reserve);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 	if (atomic_dec_and_test(&cache->count)) {
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
-		WARN_ON(cache->reserved_pinned > 0);
 		kfree(cache->free_space_ctl);
 		kfree(cache);
 	}
@@ -465,7 +482,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	 * we likely hold important locks.
 	 */
 	if (trans && (!trans->transaction->in_commit) &&
-	    (root && root != root->fs_info->tree_root)) {
+	    (root && root != root->fs_info->tree_root) &&
+	    btrfs_test_opt(root, SPACE_CACHE)) {
 		spin_lock(&cache->lock);
 		if (cache->cached != BTRFS_CACHE_NO) {
 			spin_unlock(&cache->lock);
@@ -1770,18 +1788,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 {
 	int ret;
 	u64 discarded_bytes = 0;
-	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_bio *bbio = NULL;
 
 
 	/* Tell the block device(s) that the sectors can be discarded */
 	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
-			      bytenr, &num_bytes, &multi, 0);
+			      bytenr, &num_bytes, &bbio, 0);
 	if (!ret) {
-		struct btrfs_bio_stripe *stripe = multi->stripes;
+		struct btrfs_bio_stripe *stripe = bbio->stripes;
 		int i;
 
 
-		for (i = 0; i < multi->num_stripes; i++, stripe++) {
+		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
 			if (!stripe->dev->can_discard)
 				continue;
 
@@ -1800,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 			 */
 			ret = 0;
 		}
-		kfree(multi);
+		kfree(bbio);
 	}
 
 	if (actual_bytes)
@@ -2700,6 +2718,13 @@ again:
 		goto again;
 	}
 
+	/* We've already setup this transaction, go ahead and exit */
+	if (block_group->cache_generation == trans->transid &&
+	    i_size_read(inode)) {
+		dcs = BTRFS_DC_SETUP;
+		goto out_put;
+	}
+
 	/*
 	 * We want to set the generation to 0, that way if anything goes wrong
 	 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2774,15 @@ again:
 	if (!ret)
 		dcs = BTRFS_DC_SETUP;
 	btrfs_free_reserved_data_space(inode, num_pages);
+
 out_put:
 	iput(inode);
 out_free:
 	btrfs_release_path(path);
 out:
 	spin_lock(&block_group->lock);
+	if (!ret)
+		block_group->cache_generation = trans->transid;
 	block_group->disk_cache_state = dcs;
 	spin_unlock(&block_group->lock);
 
@@ -3122,16 +3150,13 @@ commit_trans:
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
-	BTRFS_I(inode)->reserved_bytes += bytes;
 	spin_unlock(&data_sinfo->lock);
 
 	return 0;
 }
 
 /*
- * called when we are clearing an delalloc extent from the
- * inode's io_tree or there was an error for whatever reason
- * after calling btrfs_check_data_free_space
+ * Called if we need to clear a data reservation for this inode.
 */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 {
@@ -3144,7 +3169,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	data_sinfo = BTRFS_I(inode)->space_info;
 	spin_lock(&data_sinfo->lock);
 	data_sinfo->bytes_may_use -= bytes;
-	BTRFS_I(inode)->reserved_bytes -= bytes;
 	spin_unlock(&data_sinfo->lock);
 }
 
@@ -3165,6 +3189,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
 			      int force)
 {
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
 	u64 thresh;
@@ -3172,12 +3197,19 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	if (force == CHUNK_ALLOC_FORCE)
 		return 1;
 
+	/*
+	 * We need to take into account the global rsv because for all intents
+	 * and purposes it's used space.  Don't worry about locking the
+	 * global_rsv, it doesn't change except when the transaction commits.
+	 */
+	num_allocated += global_rsv->size;
+
 	/*
 	 * in limited mode, we want to have some free space up to
 	 * about 1% of the FS size.
 	 */
 	if (force == CHUNK_ALLOC_LIMITED) {
-		thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 		thresh = max_t(u64, 64 * 1024 * 1024,
 			       div_factor_fine(thresh, 1));
 
@@ -3199,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 
-	thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
 	/* 256MB or 5% of the FS */
 	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
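
A worked example makes the thresholds concrete (editorial; a 1 TiB filesystem is assumed, not taken from the patch):

	/*
	 * Assumed: total_bytes = 1 TiB.
	 * CHUNK_ALLOC_LIMITED: thresh = max(64 MiB, 1% of 1 TiB)  = 10.24 GiB
	 * general case:        thresh = max(256 MiB, 5% of 1 TiB) = 51.2 GiB
	 * div_factor(num_bytes, 8) is 80%, so the early return above fires
	 * while allocated space (now including global_rsv->size) stays
	 * below 80% of the non-readonly space.
	 */
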
@@ -3302,24 +3334,26 @@ out:
 /*
  * shrink metadata reservation for delalloc
  */
-static int shrink_delalloc(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 to_reclaim, int sync)
+static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
+			   bool wait_ordered)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_space_info *space_info;
+	struct btrfs_trans_handle *trans;
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
 	long time_left;
-	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 	int loops = 0;
 	unsigned long progress;
 
+	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 
 	smp_mb();
-	reserved = space_info->bytes_reserved;
+	reserved = space_info->bytes_may_use;
 	progress = space_info->reservation_progress;
 
 	if (reserved == 0)
@@ -3334,7 +3368,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	}
 
 	max_reclaim = min(reserved, to_reclaim);
-
+	nr_pages = max_t(unsigned long, nr_pages,
+			 max_reclaim >> PAGE_CACHE_SHIFT);
 	while (loops < 1024) {
 		/* have the flusher threads jump in and do some IO */
 		smp_mb();
@@ -3344,9 +3379,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 					       WB_REASON_FS_FREE_SPACE);
 
 		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_reserved)
-			reclaimed += reserved - space_info->bytes_reserved;
-		reserved = space_info->bytes_reserved;
+		if (reserved > space_info->bytes_may_use)
+			reclaimed += reserved - space_info->bytes_may_use;
+		reserved = space_info->bytes_may_use;
 		spin_unlock(&space_info->lock);
 
 		loops++;
@@ -3357,11 +3392,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		if (trans && trans->transaction->blocked)
 			return -EAGAIN;
 
-		time_left = schedule_timeout_interruptible(1);
+		if (wait_ordered && !trans) {
+			btrfs_wait_ordered_extents(root, 0, 0);
+		} else {
+			time_left = schedule_timeout_interruptible(1);
 
-		/* We were interrupted, exit */
-		if (time_left)
-			break;
+			/* We were interrupted, exit */
+			if (time_left)
+				break;
+		}
 
 		/* we've kicked the IO a few times, if anything has been freed,
 		 * exit.  There is no sense in looping here for a long time
@@ -3376,34 +3415,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		}
 
 	}
-	if (reclaimed >= to_reclaim && !trans)
-		btrfs_wait_ordered_extents(root, 0, 0);
+
 	return reclaimed >= to_reclaim;
 }
 
-/*
- * Retries tells us how many times we've called reserve_metadata_bytes.  The
- * idea is if this is the first call (retries == 0) then we will add to our
- * reserved count if we can't make the allocation in order to hold our place
- * while we go and try and free up space.  That way for retries > 1 we don't try
- * and add space, we just check to see if the amount of unused space is >= the
- * total space, meaning that our reservation is valid.
+/**
+ * may_commit_transaction - possibly commit the transaction if it's ok to
+ * @root - the root we're allocating for
+ * @bytes - the number of bytes we want to reserve
+ * @force - force the commit
 *
- * However if we don't intend to retry this reservation, pass -1 as retries so
- * that it short circuits this logic.
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does.  Otherwise it
+ * will return -ENOSPC.
 */
-static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
+static int may_commit_transaction(struct btrfs_root *root,
+				  struct btrfs_space_info *space_info,
+				  u64 bytes, int force)
+{
+	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+	struct btrfs_trans_handle *trans;
+
+	trans = (struct btrfs_trans_handle *)current->journal_info;
+	if (trans)
+		return -EAGAIN;
+
+	if (force)
+		goto commit;
+
+	/* See if there is enough pinned space to make this reservation */
+	spin_lock(&space_info->lock);
+	if (space_info->bytes_pinned >= bytes) {
+		spin_unlock(&space_info->lock);
+		goto commit;
+	}
+	spin_unlock(&space_info->lock);
+
+	/*
+	 * See if there is some space in the delayed insertion reservation for
+	 * this reservation.
+	 */
+	if (space_info != delayed_rsv->space_info)
+		return -ENOSPC;
+
+	spin_lock(&delayed_rsv->lock);
+	if (delayed_rsv->size < bytes) {
+		spin_unlock(&delayed_rsv->lock);
+		return -ENOSPC;
+	}
+	spin_unlock(&delayed_rsv->lock);
+
+commit:
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return -ENOSPC;
+
+	return btrfs_commit_transaction(trans, root);
+}
+
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv.  If there is not enough space it will make an attempt to
+ * flush out space to make room.  It will do this by flushing delalloc if
+ * possible or committing the transaction.  If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
 				  struct btrfs_block_rsv *block_rsv,
 				  u64 orig_bytes, int flush)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
-	u64 unused;
+	u64 used;
 	u64 num_bytes = orig_bytes;
 	int retries = 0;
 	int ret = 0;
 	bool committed = false;
 	bool flushing = false;
+	bool wait_ordered = false;
 
 again:
 	ret = 0;
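
An illustrative caller's view of the new contract (editorial sketch, not from the patch; root, block_rsv and num_bytes are placeholders):

	/*
	 * flush == 1: the function may write back delalloc or commit the
	 * transaction to satisfy the request; flush == 0: counters only.
	 */
	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
	if (ret == -EAGAIN) {
		/* we hold a transaction open, so flushing would deadlock */
	} else if (ret == -ENOSPC) {
		/* even flushing/committing could not make room */
	}

Note that the transaction handle is no longer passed in; it is discovered through current->journal_info, which is what makes the deadlock check below possible.
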
@@ -3420,7 +3515,7 @@ again:
 		 * deadlock since we are waiting for the flusher to finish, but
 		 * hold the current transaction open.
 		 */
-		if (trans)
+		if (current->journal_info)
 			return -EAGAIN;
 		ret = wait_event_interruptible(space_info->wait,
 					       !space_info->flush);
@@ -3432,9 +3527,9 @@ again:
 	}
 
 	ret = -ENOSPC;
-	unused = space_info->bytes_used + space_info->bytes_reserved +
-		 space_info->bytes_pinned + space_info->bytes_readonly +
-		 space_info->bytes_may_use;
+	used = space_info->bytes_used + space_info->bytes_reserved +
+	       space_info->bytes_pinned + space_info->bytes_readonly +
+	       space_info->bytes_may_use;
 
 	/*
 	 * The idea here is that we've not already over-reserved the block group
@@ -3443,10 +3538,9 @@ again:
 	 * lets start flushing stuff first and then come back and try to make
 	 * our reservation.
 	 */
-	if (unused <= space_info->total_bytes) {
-		unused = space_info->total_bytes - unused;
-		if (unused >= num_bytes) {
-			space_info->bytes_reserved += orig_bytes;
+	if (used <= space_info->total_bytes) {
+		if (used + orig_bytes <= space_info->total_bytes) {
+			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
 		} else {
 			/*
@@ -3462,10 +3556,64 @@ again:
 		 * amount plus the amount of bytes that we need for this
 		 * reservation.
 		 */
-		num_bytes = unused - space_info->total_bytes +
+		wait_ordered = true;
+		num_bytes = used - space_info->total_bytes +
 			    (orig_bytes * (retries + 1));
 	}
 
+	if (ret) {
+		u64 profile = btrfs_get_alloc_profile(root, 0);
+		u64 avail;
+
+		/*
+		 * If we have a lot of space that's pinned, don't bother doing
+		 * the overcommit dance yet and just commit the transaction.
+		 */
+		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
+		do_div(avail, 10);
+		if (space_info->bytes_pinned >= avail && flush && !committed) {
+			space_info->flush = 1;
+			flushing = true;
+			spin_unlock(&space_info->lock);
+			ret = may_commit_transaction(root, space_info,
+						     orig_bytes, 1);
+			if (ret)
+				goto out;
+			committed = true;
+			goto again;
+		}
+
+		spin_lock(&root->fs_info->free_chunk_lock);
+		avail = root->fs_info->free_chunk_space;
+
+		/*
+		 * If we have dup, raid1 or raid10 then only half of the free
+		 * space is actually useable.
+		 */
+		if (profile & (BTRFS_BLOCK_GROUP_DUP |
+			       BTRFS_BLOCK_GROUP_RAID1 |
+			       BTRFS_BLOCK_GROUP_RAID10))
+			avail >>= 1;
+
+		/*
+		 * If we aren't flushing don't let us overcommit too much, say
+		 * 1/8th of the space.  If we can flush, let it overcommit up to
+		 * 1/2 of the space.
+		 */
+		if (flush)
+			avail >>= 3;
+		else
+			avail >>= 1;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+
+		if (used + num_bytes < space_info->total_bytes + avail) {
+			space_info->bytes_may_use += orig_bytes;
+			ret = 0;
+		} else {
+			wait_ordered = true;
+		}
+	}
+
 	/*
 	 * Couldn't make our reservation, save our place so while we're trying
 	 * to reclaim space we can actually use it instead of somebody else
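
The shifts are easier to read with numbers plugged in (editorial; the sizes are assumed, not from the patch):

	/*
	 * Assumed: free_chunk_space = 100 GiB, profile = RAID1.
	 * The duplication check halves it:  avail >>= 1  ->  50 GiB usable.
	 * Then avail >>= 3 leaves a 6.25 GiB overcommit window and
	 * avail >>= 1 leaves 25 GiB, applied per the flush mode above.
	 */
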
@@ -3485,7 +3633,7 @@ again:
 	 * We do synchronous shrinking since we don't actually unreserve
 	 * metadata until after the IO is completed.
 	 */
-	ret = shrink_delalloc(trans, root, num_bytes, 1);
+	ret = shrink_delalloc(root, num_bytes, wait_ordered);
 	if (ret < 0)
 		goto out;
 
@@ -3497,35 +3645,17 @@ again:
 	 * so go back around and try again.
 	 */
 	if (retries < 2) {
+		wait_ordered = true;
 		retries++;
 		goto again;
 	}
 
-	/*
-	 * Not enough space to be reclaimed, don't bother committing the
-	 * transaction.
-	 */
-	spin_lock(&space_info->lock);
-	if (space_info->bytes_pinned < orig_bytes)
-		ret = -ENOSPC;
-	spin_unlock(&space_info->lock);
-	if (ret)
-		goto out;
-
-	ret = -EAGAIN;
-	if (trans)
-		goto out;
-
 	ret = -ENOSPC;
 	if (committed)
 		goto out;
 
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
-		goto out;
-	ret = btrfs_commit_transaction(trans, root);
+	ret = may_commit_transaction(root, space_info, orig_bytes, 0);
 	if (!ret) {
-		trans = NULL;
 		committed = true;
 		goto again;
 	}
@@ -3543,10 +3673,12 @@ out:
 static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root)
 {
-	struct btrfs_block_rsv *block_rsv;
-	if (root->ref_cows)
+	struct btrfs_block_rsv *block_rsv = NULL;
+
+	if (root->ref_cows || root == root->fs_info->csum_root)
 		block_rsv = trans->block_rsv;
-	else
+
+	if (!block_rsv)
 		block_rsv = root->block_rsv;
 
 	if (!block_rsv)
@@ -3617,7 +3749,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
 	}
 	if (num_bytes) {
 		spin_lock(&space_info->lock);
-		space_info->bytes_reserved -= num_bytes;
+		space_info->bytes_may_use -= num_bytes;
 		space_info->reservation_progress++;
 		spin_unlock(&space_info->lock);
 	}
@@ -3641,9 +3773,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
 {
 	memset(rsv, 0, sizeof(*rsv));
 	spin_lock_init(&rsv->lock);
-	atomic_set(&rsv->usage, 1);
-	rsv->priority = 6;
-	INIT_LIST_HEAD(&rsv->list);
 }
 
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3664,38 +3793,38 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv)
 {
-	if (rsv && atomic_dec_and_test(&rsv->usage)) {
-		btrfs_block_rsv_release(root, rsv, (u64)-1);
-		if (!rsv->durable)
-			kfree(rsv);
-	}
+	btrfs_block_rsv_release(root, rsv, (u64)-1);
+	kfree(rsv);
 }
 
-/*
- * make the block_rsv struct be able to capture freed space.
- * the captured space will re-add to the the block_rsv struct
- * after transaction commit
- */
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *block_rsv)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes)
 {
-	block_rsv->durable = 1;
-	mutex_lock(&fs_info->durable_block_rsv_mutex);
-	list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
-	mutex_unlock(&fs_info->durable_block_rsv_mutex);
+	int ret;
+
+	if (num_bytes == 0)
+		return 0;
+
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 1);
+		return 0;
+	}
+
+	return ret;
 }
 
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes)
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+				struct btrfs_block_rsv *block_rsv,
+				u64 num_bytes)
 {
 	int ret;
 
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
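
Usage-wise the two entry points differ only in the flush flag they hand to reserve_metadata_bytes(); an illustrative sketch (root, rsv and bytes are placeholders):

	/* from a context that may block, write back delalloc and commit: */
	ret = btrfs_block_rsv_add(root, rsv, bytes);

	/* from a context that must not trigger flushing (e.g. holds locks): */
	ret = btrfs_block_rsv_add_noflush(root, rsv, bytes);
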
@@ -3704,55 +3833,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved, int min_factor)
+int btrfs_block_rsv_check(struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv, int min_factor)
 {
 	u64 num_bytes = 0;
-	int commit_trans = 0;
 	int ret = -ENOSPC;
 
 	if (!block_rsv)
 		return 0;
 
 	spin_lock(&block_rsv->lock);
-	if (min_factor > 0)
-		num_bytes = div_factor(block_rsv->size, min_factor);
-	if (min_reserved > num_bytes)
-		num_bytes = min_reserved;
+	num_bytes = div_factor(block_rsv->size, min_factor);
+	if (block_rsv->reserved >= num_bytes)
+		ret = 0;
+	spin_unlock(&block_rsv->lock);
 
-	if (block_rsv->reserved >= num_bytes) {
+	return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv,
+			   u64 min_reserved)
+{
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	num_bytes = min_reserved;
+	if (block_rsv->reserved >= num_bytes)
 		ret = 0;
-	} else {
+	else
 		num_bytes -= block_rsv->reserved;
-		if (block_rsv->durable &&
-		    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
-			commit_trans = 1;
-	}
 	spin_unlock(&block_rsv->lock);
+
 	if (!ret)
 		return 0;
 
-	if (block_rsv->refill_used) {
-		ret = reserve_metadata_bytes(trans, root, block_rsv,
-					     num_bytes, 0);
-		if (!ret) {
-			block_rsv_add_bytes(block_rsv, num_bytes, 0);
-			return 0;
-		}
-	}
-
-	if (commit_trans) {
-		if (trans)
-			return -EAGAIN;
-		trans = btrfs_join_transaction(root);
-		BUG_ON(IS_ERR(trans));
-		ret = btrfs_commit_transaction(trans, root);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
 	}
 
-	return -ENOSPC;
+	return ret;
 }
 
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
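
The split gives callers two distinct tools; a sketch of the intended pairing (editorial; rsv and min_reserved are placeholders):

	/*
	 * btrfs_block_rsv_check() is now a cheap counter check: 0 if the
	 * rsv is at least min_factor/10 full, -ENOSPC otherwise.
	 * btrfs_block_rsv_refill() actively tops the rsv back up and may
	 * flush.  "Keep this rsv at least half full" then reads:
	 */
	if (btrfs_block_rsv_check(root, rsv, 5))
		ret = btrfs_block_rsv_refill(root, rsv, min_reserved);
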
@@ -3784,7 +3910,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 	u64 num_bytes;
 	u64 meta_used;
 	u64 data_used;
-	int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
 
 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
 	spin_lock(&sinfo->lock);
@@ -3828,12 +3954,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	if (sinfo->total_bytes > num_bytes) {
 		num_bytes = sinfo->total_bytes - num_bytes;
 		block_rsv->reserved += num_bytes;
-		sinfo->bytes_reserved += num_bytes;
+		sinfo->bytes_may_use += num_bytes;
 	}
 
 	if (block_rsv->reserved >= block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
-		sinfo->bytes_reserved -= num_bytes;
+		sinfo->bytes_may_use -= num_bytes;
 		sinfo->reservation_progress++;
 		block_rsv->reserved = block_rsv->size;
 		block_rsv->full = 1;
@@ -3849,16 +3975,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 	fs_info->chunk_block_rsv.space_info = space_info;
-	fs_info->chunk_block_rsv.priority = 10;
 
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 	fs_info->global_block_rsv.space_info = space_info;
-	fs_info->global_block_rsv.priority = 10;
-	fs_info->global_block_rsv.refill_used = 1;
 	fs_info->delalloc_block_rsv.space_info = space_info;
 	fs_info->trans_block_rsv.space_info = space_info;
 	fs_info->empty_block_rsv.space_info = space_info;
-	fs_info->empty_block_rsv.priority = 10;
+	fs_info->delayed_block_rsv.space_info = space_info;
 
 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3866,10 +3989,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 
-	btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
-
-	btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
-
 	update_global_block_rsv(fs_info);
 }
 
@@ -3882,37 +4001,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
-}
-
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
-				    struct btrfs_block_rsv *rsv)
-{
-	struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
-	u64 num_bytes;
-	int ret;
-
-	/*
-	 * Truncate should be freeing data, but give us 2 items just in case it
-	 * needs to use some space.  We may want to be smarter about this in the
-	 * future.
-	 */
-	num_bytes = btrfs_calc_trans_metadata_size(root, 2);
-
-	/* We already have enough bytes, just return */
-	if (rsv->reserved >= num_bytes)
-		return 0;
-
-	num_bytes -= rsv->reserved;
-
-	/*
-	 * You should have reserved enough space before hand to do this, so this
-	 * should not fail.
-	 */
-	ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
-	BUG_ON(ret);
-
-	return 0;
+	WARN_ON(fs_info->delayed_block_rsv.size > 0);
+	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 }
 
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3921,9 +4011,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 	if (!trans->bytes_reserved)
 		return;
 
-	BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
-	btrfs_block_rsv_release(root, trans->block_rsv,
-				trans->bytes_reserved);
+	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 	trans->bytes_reserved = 0;
 }
 
@@ -3965,11 +4053,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
+/**
+ * drop_outstanding_extent - drop an outstanding extent
+ * @inode: the inode we're dropping the extent for
+ *
+ * This is called when we are freeing up an outstanding extent, either after
+ * an error or after an extent is written.  This will return the number of
+ * reserved extents that need to be freed.  This must be called with
+ * BTRFS_I(inode)->lock held.
+ */
 static unsigned drop_outstanding_extent(struct inode *inode)
 {
 	unsigned dropped_extents = 0;
 
-	spin_lock(&BTRFS_I(inode)->lock);
 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 
@@ -3979,19 +4075,70 @@ static unsigned drop_outstanding_extent(struct inode *inode)
 	 */
 	if (BTRFS_I(inode)->outstanding_extents >=
 	    BTRFS_I(inode)->reserved_extents)
-		goto out;
+		return 0;
 
 	dropped_extents = BTRFS_I(inode)->reserved_extents -
 		BTRFS_I(inode)->outstanding_extents;
 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
-out:
-	spin_unlock(&BTRFS_I(inode)->lock);
 	return dropped_extents;
 }
 
-static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+/**
+ * calc_csum_metadata_size - return the amount of metadata space that must be
+ * reserved/freed for the given bytes.
+ * @inode: the inode we're manipulating
+ * @num_bytes: the number of bytes in question
+ * @reserve: 1 if we are reserving space, 0 if we are freeing space
+ *
+ * This adjusts the number of csum_bytes in the inode and then returns the
+ * correct amount of metadata that must either be reserved or freed.  We
+ * calculate how many checksums we can fit into one leaf and then divide the
+ * number of bytes that will need to be checksummed by this value to figure out
+ * how many checksums will be required.  If we are adding bytes then the number
+ * may go up and we will return the number of additional bytes that must be
+ * reserved.  If it is going down we will return the number of bytes that must
+ * be freed.
+ *
+ * This must be called with BTRFS_I(inode)->lock held.
+ */
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
+				   int reserve)
 {
-	return num_bytes >>= 3;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 csum_size;
+	int num_csums_per_leaf;
+	int num_csums;
+	int old_csums;
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
+	    BTRFS_I(inode)->csum_bytes == 0)
+		return 0;
+
+	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	if (reserve)
+		BTRFS_I(inode)->csum_bytes += num_bytes;
+	else
+		BTRFS_I(inode)->csum_bytes -= num_bytes;
+	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+	num_csums_per_leaf = (int)div64_u64(csum_size,
+					    sizeof(struct btrfs_csum_item) +
+					    sizeof(struct btrfs_disk_key));
+	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	num_csums = num_csums + num_csums_per_leaf - 1;
+	num_csums = num_csums / num_csums_per_leaf;
+
+	old_csums = old_csums + num_csums_per_leaf - 1;
+	old_csums = old_csums / num_csums_per_leaf;
+
+	/* No change, no need to reserve more */
+	if (old_csums == num_csums)
+		return 0;
+
+	if (reserve)
+		return btrfs_calc_trans_metadata_size(root,
+						      num_csums - old_csums);
+
+	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
 }
 
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
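
A worked example of the leaf arithmetic (editorial; the concrete numbers are assumed for a 4 KiB sector/leaf filesystem and are only illustrative):

	/*
	 * Assume the csum_size division yields roughly 220 checksum slots
	 * per leaf.  With csum_bytes = 1 MiB, 256 sectors need
	 * ceil(256/220) = 2 leaves.  Reserving another 1 MiB makes that
	 * 512 sectors -> 3 leaves, so the function returns
	 * btrfs_calc_trans_metadata_size(root, 1).  Freeing the same
	 * range walks the calculation backwards.  Growth that stays
	 * within the current leaf count returns 0.
	 */
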
@@ -4000,9 +4147,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
 	u64 to_reserve = 0;
 	unsigned nr_extents = 0;
+	int flush = 1;
 	int ret;
 
-	if (btrfs_transaction_in_commit(root->fs_info))
+	if (btrfs_is_free_space_inode(root, inode))
+		flush = 0;
+
+	if (flush && btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4018,18 +4169,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	}
+	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	to_reserve += calc_csum_metadata_size(inode, num_bytes);
-	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 	if (ret) {
+		u64 to_free = 0;
 		unsigned dropped;
+
+		spin_lock(&BTRFS_I(inode)->lock);
+		dropped = drop_outstanding_extent(inode);
+		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+		spin_unlock(&BTRFS_I(inode)->lock);
+		to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
 		/*
-		 * We don't need the return value since our reservation failed,
-		 * we just need to clean up our counter.
+		 * Somebody could have come in and twiddled with the
+		 * reservation, so if we have to free more than we would have
+		 * reserved from this reservation go ahead and release those
+		 * bytes.
 		 */
-		dropped = drop_outstanding_extent(inode);
-		WARN_ON(dropped > 1);
+		to_free -= to_reserve;
+		if (to_free)
+			btrfs_block_rsv_release(root, block_rsv, to_free);
 		return ret;
 	}
 
@@ -4038,6 +4200,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	return 0;
 }
 
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for
+ * @num_bytes: the number of bytes we're releasing
+ *
+ * This will release the metadata reservation for an inode.  This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations.
+ */
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4045,9 +4216,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 	unsigned dropped;
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
+	spin_lock(&BTRFS_I(inode)->lock);
 	dropped = drop_outstanding_extent(inode);
 
-	to_free = calc_csum_metadata_size(inode, num_bytes);
+	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+	spin_unlock(&BTRFS_I(inode)->lock);
 	if (dropped > 0)
 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
@@ -4055,6 +4228,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 				to_free);
 }
 
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * @inode: inode we're writing to
+ * @num_bytes: the number of bytes we want to allocate
+ *
+ * This will do the following things
+ *
+ * o reserve space in the data space info for num_bytes
+ * o reserve space in the metadata space info based on number of outstanding
+ *   extents and how much csums will be needed
+ * o add to the inode's ->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *
+ * This will return 0 for success and -ENOSPC if there is no space left.
+ */
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 {
 	int ret;
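
An illustrative write-path pairing (editorial sketch; do_write() is a hypothetical stand-in for the actual copy/ordered-extent machinery):

	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
	if (ret)
		return ret;
	ret = do_write(inode, num_bytes);
	if (ret)
		/* error path: drop the data AND metadata reservation */
		btrfs_delalloc_release_space(inode, num_bytes);
	/*
	 * On success only the metadata half is dropped later, at IO
	 * completion, via btrfs_delalloc_release_metadata().
	 */
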
@@ -4072,6 +4260,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 	return 0;
 }
 
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @num_bytes: the number of bytes we want to free up
+ *
+ * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
+ * called in the case that we don't need the metadata AND data reservations
+ * anymore, i.e. if there is an error or we insert an inline extent.
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ */
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
 {
 	btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4091,12 +4292,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 
 	/* block accounting for super block */
 	spin_lock(&info->delalloc_lock);
-	old_val = btrfs_super_bytes_used(&info->super_copy);
+	old_val = btrfs_super_bytes_used(info->super_copy);
 	if (alloc)
 		old_val += num_bytes;
 	else
 		old_val -= num_bytes;
-	btrfs_set_super_bytes_used(&info->super_copy, old_val);
+	btrfs_set_super_bytes_used(info->super_copy, old_val);
 	spin_unlock(&info->delalloc_lock);
 
 	while (total) {
@@ -4124,7 +4325,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		spin_lock(&cache->space_info->lock);
 		spin_lock(&cache->lock);
 
-		if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+		if (btrfs_test_opt(root, SPACE_CACHE) &&
 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
 			cache->disk_cache_state = BTRFS_DC_CLEAR;
 
@@ -4136,7 +4337,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			btrfs_set_block_group_used(&cache->item, old_val);
 			cache->reserved -= num_bytes;
 			cache->space_info->bytes_reserved -= num_bytes;
-			cache->space_info->reservation_progress++;
 			cache->space_info->bytes_used += num_bytes;
 			cache->space_info->disk_used += num_bytes * factor;
 			spin_unlock(&cache->lock);
@@ -4188,7 +4388,6 @@ static int pin_down_extent(struct btrfs_root *root,
 	if (reserved) {
 		cache->reserved -= num_bytes;
 		cache->space_info->bytes_reserved -= num_bytes;
-		cache->space_info->reservation_progress++;
 	}
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
@@ -4216,45 +4415,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
 }
 
 /*
- * update size of reserved extents. this function may return -EAGAIN
- * if 'reserve' is true or 'sinfo' is false.
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    u64 bytenr, u64 num_bytes)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+	BUG_ON(!cache);
+
+	/*
+	 * pull in the free space cache (if any) so that our pin
+	 * removes the free space from the cache.  We have load_only set
+	 * to one because the slow code to read in the free extents does check
+	 * the pinned extents.
+	 */
+	cache_block_group(cache, trans, root, 1);
+
+	pin_down_extent(root, cache, bytenr, num_bytes, 0);
+
+	/* remove us from the free space cache (if we're there at all) */
+	btrfs_remove_free_space(cache, bytenr, num_bytes);
+	btrfs_put_block_group(cache);
+	return 0;
+}
+
+/**
+ * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @num_bytes: The number of bytes in question
+ * @reserve: One of the reservation enums
+ *
+ * This is called by the allocator when it reserves space, or by somebody who is
+ * freeing space that was never actually used on disk.  For example if you
+ * reserve some space for a new leaf in transaction A and before transaction A
+ * commits you free that leaf, you call this with reserve set to 0 in order to
+ * clear the reservation.
+ *
+ * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
+ * ENOSPC accounting.  For data we handle the reservation through clearing the
+ * delalloc bits in the io_tree.  We have to do this since we could end up
+ * allocating less disk space for the amount of data we have reserved in the
+ * case of compression.
+ *
+ * If this is a reservation and the block group has become read only we cannot
+ * make the reservation and return -EAGAIN, otherwise this function always
+ * succeeds.
 */
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				u64 num_bytes, int reserve, int sinfo)
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				       u64 num_bytes, int reserve)
 {
+	struct btrfs_space_info *space_info = cache->space_info;
 	int ret = 0;
-	if (sinfo) {
-		struct btrfs_space_info *space_info = cache->space_info;
-		spin_lock(&space_info->lock);
-		spin_lock(&cache->lock);
-		if (reserve) {
-			if (cache->ro) {
-				ret = -EAGAIN;
-			} else {
-				cache->reserved += num_bytes;
-				space_info->bytes_reserved += num_bytes;
-			}
-		} else {
-			if (cache->ro)
-				space_info->bytes_readonly += num_bytes;
-			cache->reserved -= num_bytes;
-			space_info->bytes_reserved -= num_bytes;
-			space_info->reservation_progress++;
-		}
-		spin_unlock(&cache->lock);
-		spin_unlock(&space_info->lock);
-	} else {
-		spin_lock(&cache->lock);
+
+	spin_lock(&space_info->lock);
+	spin_lock(&cache->lock);
+	if (reserve != RESERVE_FREE) {
 		if (cache->ro) {
 			ret = -EAGAIN;
 		} else {
-			if (reserve)
-				cache->reserved += num_bytes;
-			else
-				cache->reserved -= num_bytes;
+			cache->reserved += num_bytes;
+			space_info->bytes_reserved += num_bytes;
+			if (reserve == RESERVE_ALLOC) {
+				BUG_ON(space_info->bytes_may_use < num_bytes);
+				space_info->bytes_may_use -= num_bytes;
+			}
 		}
-		spin_unlock(&cache->lock);
+	} else {
+		if (cache->ro)
+			space_info->bytes_readonly += num_bytes;
+		cache->reserved -= num_bytes;
+		space_info->bytes_reserved -= num_bytes;
+		space_info->reservation_progress++;
 	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&space_info->lock);
 	return ret;
 }
 
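
The -EAGAIN case deserves a sketch (editorial; it mirrors the allocator change further down, with block_group, offset and alloc_type as placeholders):

	ret = btrfs_update_reserved_bytes(block_group, num_bytes, alloc_type);
	if (ret == -EAGAIN) {
		/*
		 * The group went read-only under us: return the space to
		 * the free space cache and let the caller retry elsewhere.
		 */
		btrfs_add_free_space(block_group, offset, num_bytes);
	}
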
@@ -4320,13 +4556,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
 		cache->space_info->bytes_pinned -= len;
-		if (cache->ro) {
+		if (cache->ro)
 			cache->space_info->bytes_readonly += len;
-		} else if (cache->reserved_pinned > 0) {
-			len = min(len, cache->reserved_pinned);
-			cache->reserved_pinned -= len;
-			cache->space_info->bytes_reserved += len;
-		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
 	}
@@ -4341,11 +4572,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_io_tree *unpin;
-	struct btrfs_block_rsv *block_rsv;
-	struct btrfs_block_rsv *next_rsv;
 	u64 start;
 	u64 end;
-	int idx;
 	int ret;
 
 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4368,30 +4596,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		cond_resched();
 	}
 
-	mutex_lock(&fs_info->durable_block_rsv_mutex);
-	list_for_each_entry_safe(block_rsv, next_rsv,
-				 &fs_info->durable_block_rsv_list, list) {
-
-		idx = trans->transid & 0x1;
-		if (block_rsv->freed[idx] > 0) {
-			block_rsv_add_bytes(block_rsv,
-					    block_rsv->freed[idx], 0);
-			block_rsv->freed[idx] = 0;
-		}
-		if (atomic_read(&block_rsv->usage) == 0) {
-			btrfs_block_rsv_release(root, block_rsv, (u64)-1);
-
-			if (block_rsv->freed[0] == 0 &&
-			    block_rsv->freed[1] == 0) {
-				list_del_init(&block_rsv->list);
-				kfree(block_rsv);
-			}
-		} else {
-			btrfs_block_rsv_release(root, block_rsv, 0);
-		}
-	}
-	mutex_unlock(&fs_info->durable_block_rsv_mutex);
-
 	return 0;
 }
 
@@ -4669,7 +4873,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			   struct extent_buffer *buf,
 			   u64 parent, int last_ref)
 {
-	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_group_cache *cache = NULL;
 	int ret;
 
@@ -4684,64 +4887,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 	if (!last_ref)
 		return;
 
-	block_rsv = get_block_rsv(trans, root);
 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
-	if (block_rsv->space_info != cache->space_info)
-		goto out;
 
 	if (btrfs_header_generation(buf) == trans->transid) {
 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 			ret = check_ref_cleanup(trans, root, buf->start);
 			if (!ret)
-				goto pin;
+				goto out;
 		}
 
 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 			pin_down_extent(root, cache, buf->start, buf->len, 1);
-			goto pin;
+			goto out;
 		}
 
 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
 
 		btrfs_add_free_space(cache, buf->start, buf->len);
-		ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
-		if (ret == -EAGAIN) {
-			/* block group became read-only */
-			btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
-			goto out;
-		}
-
-		ret = 1;
-		spin_lock(&block_rsv->lock);
-		if (block_rsv->reserved < block_rsv->size) {
-			block_rsv->reserved += buf->len;
-			ret = 0;
-		}
-		spin_unlock(&block_rsv->lock);
-
-		if (ret) {
-			spin_lock(&cache->space_info->lock);
-			cache->space_info->bytes_reserved -= buf->len;
-			cache->space_info->reservation_progress++;
-			spin_unlock(&cache->space_info->lock);
-		}
-		goto out;
-	}
-pin:
-	if (block_rsv->durable && !cache->ro) {
-		ret = 0;
-		spin_lock(&cache->lock);
-		if (!cache->ro) {
-			cache->reserved_pinned += buf->len;
-			ret = 1;
-		}
-		spin_unlock(&cache->lock);
-
-		if (ret) {
-			spin_lock(&block_rsv->lock);
-			block_rsv->freed[trans->transid & 0x1] += buf->len;
-			spin_unlock(&block_rsv->lock);
-		}
+		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
 	}
 out:
 	/*
@@ -4884,10 +5047,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	int last_ptr_loop = 0;
 	int loop = 0;
 	int index = 0;
+	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
+		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
 	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
 	bool use_cluster = true;
+	bool have_caching_bg = false;
 	u64 ideal_cache_percent = 0;
 	u64 ideal_cache_offset = 0;
 
@@ -4970,6 +5136,7 @@ ideal_cache:
 		}
 	}
 search:
+	have_caching_bg = false;
 	down_read(&space_info->groups_sem);
 	list_for_each_entry(block_group, &space_info->block_groups[index],
 			    list) {
@@ -5178,6 +5345,8 @@ refill_cluster:
 			failed_alloc = true;
 			goto have_block_group;
 		} else if (!offset) {
+			if (!cached)
+				have_caching_bg = true;
 			goto loop;
 		}
 checks:
@@ -5203,8 +5372,8 @@ checks:
 					     search_start - offset);
 	BUG_ON(offset > search_start);
 
-	ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
-					  (data & BTRFS_BLOCK_GROUP_DATA));
+	ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+					  alloc_type);
 	if (ret == -EAGAIN) {
 		btrfs_add_free_space(block_group, offset, num_bytes);
 		goto loop;
@@ -5228,6 +5397,9 @@ loop:
 	}
 	up_read(&space_info->groups_sem);
 
+	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+		goto search;
+
 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
 		goto search;
 
@@ -5326,7 +5498,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 	int index = 0;
 
 	spin_lock(&info->lock);
-	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
+	       (unsigned long long)info->flags,
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
 				    info->bytes_pinned - info->bytes_reserved -
 				    info->bytes_readonly),
@@ -5412,7 +5585,8 @@ again:
 	return ret;
 }
 
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+					u64 start, u64 len, int pin)
 {
 	struct btrfs_block_group_cache *cache;
 	int ret = 0;
@@ -5427,8 +5601,12 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 	if (btrfs_test_opt(root, DISCARD))
 		ret = btrfs_discard_extent(root, start, len, NULL);
 
-	btrfs_add_free_space(cache, start, len);
-	btrfs_update_reserved_bytes(cache, len, 0, 1);
+	if (pin)
+		pin_down_extent(root, cache, start, len, 1);
+	else {
+		btrfs_add_free_space(cache, start, len);
+		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+	}
 	btrfs_put_block_group(cache);
 
 	trace_btrfs_reserved_extent_free(root, start, len);
@@ -5436,6 +5614,18 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 	return ret;
 }
 
+int btrfs_free_reserved_extent(struct btrfs_root *root,
+			       u64 start, u64 len)
+{
+	return __btrfs_free_reserved_extent(root, start, len, 0);
+}
+
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+				       u64 start, u64 len)
+{
+	return __btrfs_free_reserved_extent(root, start, len, 1);
+}
+
 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      u64 parent, u64 root_objectid,
@@ -5631,7 +5821,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 		put_caching_control(caching_ctl);
 	}
 
-	ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
+	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
+					  RESERVE_ALLOC_NO_ACCOUNT);
 	BUG_ON(ret);
 	btrfs_put_block_group(block_group);
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5688,8 +5879,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(trans, root, block_rsv,
-					     blocksize, 0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
 		/*
 		 * If we couldn't reserve metadata bytes try and use some from
 		 * the global reserve.
@@ -5709,13 +5899,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	if (!ret)
 		return block_rsv;
 	if (ret) {
-		WARN_ON(1);
-		ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
-					     0);
+		static DEFINE_RATELIMIT_STATE(_rs,
+				DEFAULT_RATELIMIT_INTERVAL,
+				/*DEFAULT_RATELIMIT_BURST*/ 2);
+		if (__ratelimit(&_rs)) {
+			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
+			WARN_ON(1);
+		}
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
 		if (!ret) {
-			spin_lock(&block_rsv->lock);
-			block_rsv->size += blocksize;
-			spin_unlock(&block_rsv->lock);
 			return block_rsv;
 		} else if (ret && block_rsv != global_rsv) {
 			ret = block_rsv_use_bytes(global_rsv, blocksize);
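
The guard above is the stock <linux/ratelimit.h> pattern (hence the new include at the top of this patch); generalized, it looks like this illustrative sketch:

	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, 2);

	if (__ratelimit(&rs))	/* at most 2 hits per 5 s interval pass */
		printk(KERN_DEBUG "btrfs: noisy condition\n");
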
@@ -6593,12 +6785,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
 
 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
-	    sinfo->bytes_may_use + sinfo->bytes_readonly +
-	    cache->reserved_pinned + num_bytes + min_allocable_bytes <=
-	    sinfo->total_bytes) {
+	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
+	    min_allocable_bytes <= sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
-		sinfo->bytes_reserved += cache->reserved_pinned;
-		cache->reserved_pinned = 0;
 		cache->ro = 1;
 		ret = 0;
 	}
@@ -6965,7 +7154,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 				       struct btrfs_space_info,
 				       list);
 		if (space_info->bytes_pinned > 0 ||
-		    space_info->bytes_reserved > 0) {
+		    space_info->bytes_reserved > 0 ||
+		    space_info->bytes_may_use > 0) {
 			WARN_ON(1);
 			dump_space_info(space_info, 0, 0);
 		}
@@ -7007,14 +7197,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		return -ENOMEM;
 	path->reada = 1;
 
-	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
-	if (cache_gen != 0 &&
-	    btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+	if (btrfs_test_opt(root, SPACE_CACHE) &&
+	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
 		need_clear = 1;
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		need_clear = 1;
-	if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
-		printk(KERN_INFO "btrfs: disk space caching is enabled\n");
 
 	while (1) {
 		ret = find_first_block_group(root, path, &key);
@@ -7253,7 +7441,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	inode = lookup_free_space_inode(root, block_group, path);
+	inode = lookup_free_space_inode(tree_root, block_group, path);
 	if (!IS_ERR(inode)) {
 		ret = btrfs_orphan_add(trans, inode);
 		BUG_ON(ret);
@@ -7269,7 +7457,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			spin_unlock(&block_group->lock);
 		}
 		/* One for our lookup ref */
-		iput(inode);
+		btrfs_add_delayed_iput(inode);
 	}
 
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7340,7 +7528,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 	int mixed = 0;
 	int ret;
 
-	disk_super = &fs_info->super_copy;
+	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		return 1;
 