13 anni fa · a22180d266
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 
				 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
			
 
				 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
			
 
				 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
			
 
				-	   reada.o backref.o ulist.o qgroup.o send.o
			
 
				+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
			
 
				 
			
 
				 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
			
 
				 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
			
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 
				 			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
			
 
				 			if (ret < 0)
			
 
				 				return ret;
			
 
				+			if (ret == 0)
			
 
				+				acl = NULL;
			
 
				 		}
			
 
				 		ret = 0;
			
 
				 		break;
			
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
 
				 		     pos2 = n2, n2 = pos2->next) {
			
 
				 			struct __prelim_ref *ref2;
			
 
				 			struct __prelim_ref *xchg;
			
 
				+			struct extent_inode_elem *eie;
			
 
				 
			
 
				 			ref2 = list_entry(pos2, struct __prelim_ref, list);
			
 
				 
			
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
 
				 					ref1 = ref2;
			
 
				 					ref2 = xchg;
			
 
				 				}
			
 
				-				ref1->count += ref2->count;
			
 
				 			} else {
			
 
				 				if (ref1->parent != ref2->parent)
			
 
				 					continue;
			
 
				-				ref1->count += ref2->count;
			
 
				 			}
			
 
				+
			
 
				+			eie = ref1->inode_list;
			
 
				+			while (eie && eie->next)
			
 
				+				eie = eie->next;
			
 
				+			if (eie)
			
 
				+				eie->next = ref2->inode_list;
			
 
				+			else
			
 
				+				ref1->inode_list = ref2->inode_list;
			
 
				+			ref1->count += ref2->count;
			
 
				+
			
 
				 			list_del(&ref2->list);
			
 
				 			kfree(ref2);
			
 
				 		}
			
@@ -890,8 +899,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 
				 	while (!list_empty(&prefs)) {
			
 
				 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
			
 
				 		list_del(&ref->list);
			
 
				-		if (ref->count < 0)
			
 
				-			WARN_ON(1);
			
 
				+		WARN_ON(ref->count < 0);
			
 
				 		if (ref->count && ref->root_id && ref->parent == 0) {
			
 
				 			/* no parent == root of tree */
			
 
				 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
			
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,7 @@
 
				 #define BTRFS_INODE_HAS_ORPHAN_ITEM		5
			
 
				 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
			
 
				 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
			
 
				+#define BTRFS_INODE_COPY_EVERYTHING		8
			
 
				 
			
 
				 /* in memory btrfs inode */
			
 
				 struct btrfs_inode {
			
@@ -90,6 +91,9 @@ struct btrfs_inode {
 
				 
			
 
				 	unsigned long runtime_flags;
			
 
				 
			
 
				+	/* Keep track of who's O_SYNC/fsyncing currently */
			
 
				+	atomic_t sync_writers;
			
 
				+
			
 
				 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
			
 
				 	 * enough field for this.
			
 
				 	 */
			
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -137,7 +137,7 @@ struct btrfsic_block {
 
				 	unsigned int never_written:1;	/* block was added because it was
			
 
				 					 * referenced, not because it was
			
 
				 					 * written */
			
 
				-	unsigned int mirror_num:2;	/* large enough to hold
			
 
				+	unsigned int mirror_num;	/* large enough to hold
			
 
				 					 * BTRFS_SUPER_MIRROR_MAX */
			
 
				 	struct btrfsic_dev_state *dev_state;
			
 
				 	u64 dev_bytenr;		/* key, physical byte num on disk */
			
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 
				 		}
			
 
				 
			
 
				 		num_copies =
			
 
				-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
			
 
				+		    btrfs_num_copies(state->root->fs_info,
			
 
				 				     next_bytenr, state->metablock_size);
			
 
				 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
			
 
				 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
			
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
 
				 		}
			
 
				 
			
 
				 		num_copies =
			
 
				-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
			
 
				+		    btrfs_num_copies(state->root->fs_info,
			
 
				 				     next_bytenr, state->metablock_size);
			
 
				 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
			
 
				 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
			
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
 
				 	*next_blockp = NULL;
			
 
				 	if (0 == *num_copiesp) {
			
 
				 		*num_copiesp =
			
 
				-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
			
 
				+		    btrfs_num_copies(state->root->fs_info,
			
 
				 				     next_bytenr, state->metablock_size);
			
 
				 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
			
 
				 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
			
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
 
				 			chunk_len = num_bytes;
			
 
				 
			
 
				 		num_copies =
			
 
				-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
			
 
				+		    btrfs_num_copies(state->root->fs_info,
			
 
				 				     next_bytenr, state->datablock_size);
			
 
				 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
			
 
				 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
			
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 
				 	struct btrfs_device *device;
			
 
				 
			
 
				 	length = len;
			
 
				-	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
			
 
				+	ret = btrfs_map_block(state->root->fs_info, READ,
			
 
				 			      bytenr, &length, &multi, mirror_num);
			
 
				 
			
 
				+	if (ret) {
			
 
				+		block_ctx_out->start = 0;
			
 
				+		block_ctx_out->dev_bytenr = 0;
			
 
				+		block_ctx_out->len = 0;
			
 
				+		block_ctx_out->dev = NULL;
			
 
				+		block_ctx_out->datav = NULL;
			
 
				+		block_ctx_out->pagev = NULL;
			
 
				+		block_ctx_out->mem_to_free = NULL;
			
 
				+
			
 
				+		return ret;
			
 
				+	}
			
 
				+
			
 
				 	device = multi->stripes[0].dev;
			
 
				 	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
			
 
				 	block_ctx_out->dev_bytenr = multi->stripes[0].physical;
			
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 
				 	block_ctx_out->pagev = NULL;
			
 
				 	block_ctx_out->mem_to_free = NULL;
			
 
				 
			
 
				-	if (0 == ret)
			
 
				-		kfree(multi);
			
 
				+	kfree(multi);
			
 
				 	if (NULL == block_ctx_out->dev) {
			
 
				 		ret = -ENXIO;
			
 
				 		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
			
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
 
				 		}
			
 
				 
			
 
				 		num_copies =
			
 
				-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
			
 
				+		    btrfs_num_copies(state->root->fs_info,
			
 
				 				     next_bytenr, BTRFS_SUPER_INFO_SIZE);
			
 
				 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
			
 
				 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
			
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 
				 	struct btrfsic_block_data_ctx block_ctx;
			
 
				 	int match = 0;
			
 
				 
			
 
				-	num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
			
 
				+	num_copies = btrfs_num_copies(state->root->fs_info,
			
 
				 				      bytenr, state->metablock_size);
			
 
				 
			
 
				 	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
			
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
				 
			
 
				 			ret = btrfs_map_bio(root, READ, comp_bio,
			
 
				 					    mirror_num, 0);
			
 
				-			BUG_ON(ret); /* -ENOMEM */
			
 
				+			if (ret)
			
 
				+				bio_endio(comp_bio, ret);
			
 
				 
			
 
				 			bio_put(comp_bio);
			
 
				 
			
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
				 	}
			
 
				 
			
 
				 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
			
 
				-	BUG_ON(ret); /* -ENOMEM */
			
 
				+	if (ret)
			
 
				+		bio_endio(comp_bio, ret);
			
 
				 
			
 
				 	bio_put(comp_bio);
			
 
				 	return 0;
			
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 
				 			      struct extent_buffer *dst_buf,
			
 
				 			      struct extent_buffer *src_buf);
			
 
				 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
			
 
				-		    struct btrfs_path *path, int level, int slot,
			
 
				-		    int tree_mod_log);
			
 
				+		    struct btrfs_path *path, int level, int slot);
			
 
				 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
			
 
				 				 struct extent_buffer *eb);
			
 
				 struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
			
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 
				 
			
 
				 static noinline void
			
 
				 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			
 
				-			  struct extent_buffer *eb,
			
 
				-			  struct btrfs_disk_key *disk_key, int slot, int atomic)
			
 
				+			  struct extent_buffer *eb, int slot, int atomic)
			
 
				 {
			
 
				 	int ret;
			
 
				 
			
@@ -1140,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 
				 		switch (tm->op) {
			
 
				 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			
 
				 			BUG_ON(tm->slot < n);
			
 
				-		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
			
 
				 		case MOD_LOG_KEY_REMOVE:
			
 
				+			n++;
			
 
				+		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
			
 
				 			btrfs_set_node_key(eb, &tm->key, tm->slot);
			
 
				 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			
 
				 			btrfs_set_node_ptr_generation(eb, tm->slot,
			
 
				 						      tm->generation);
			
 
				-			n++;
			
 
				 			break;
			
 
				 		case MOD_LOG_KEY_REPLACE:
			
 
				 			BUG_ON(tm->slot >= n);
			
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 
				 	u64 search_start;
			
 
				 	int ret;
			
 
				 
			
 
				-	if (trans->transaction != root->fs_info->running_transaction) {
			
 
				-		printk(KERN_CRIT "trans %llu running %llu\n",
			
 
				+	if (trans->transaction != root->fs_info->running_transaction)
			
 
				+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
			
 
				 		       (unsigned long long)trans->transid,
			
 
				 		       (unsigned long long)
			
 
				 		       root->fs_info->running_transaction->transid);
			
 
				-		WARN_ON(1);
			
 
				-	}
			
 
				-	if (trans->transid != root->fs_info->generation) {
			
 
				-		printk(KERN_CRIT "trans %llu running %llu\n",
			
 
				+
			
 
				+	if (trans->transid != root->fs_info->generation)
			
 
				+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
			
 
				 		       (unsigned long long)trans->transid,
			
 
				 		       (unsigned long long)root->fs_info->generation);
			
 
				-		WARN_ON(1);
			
 
				-	}
			
 
				 
			
 
				 	if (!should_cow_block(trans, root, buf)) {
			
 
				 		*cow_ret = buf;
			
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
				 	if (cache_only && parent_level != 1)
			
 
				 		return 0;
			
 
				 
			
 
				-	if (trans->transaction != root->fs_info->running_transaction)
			
 
				-		WARN_ON(1);
			
 
				-	if (trans->transid != root->fs_info->generation)
			
 
				-		WARN_ON(1);
			
 
				+	WARN_ON(trans->transaction != root->fs_info->running_transaction);
			
 
				+	WARN_ON(trans->transid != root->fs_info->generation);
			
 
				 
			
 
				 	parent_nritems = btrfs_header_nritems(parent);
			
 
				 	blocksize = btrfs_level_size(root, parent_level - 1);
			
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
				 		if (btrfs_header_nritems(right) == 0) {
			
 
				 			clean_tree_block(trans, root, right);
			
 
				 			btrfs_tree_unlock(right);
			
 
				-			del_ptr(trans, root, path, level + 1, pslot + 1, 1);
			
 
				+			del_ptr(trans, root, path, level + 1, pslot + 1);
			
 
				 			root_sub_used(root, right->len);
			
 
				 			btrfs_free_tree_block(trans, root, right, 0, 1);
			
 
				 			free_extent_buffer_stale(right);
			
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
				 			struct btrfs_disk_key right_key;
			
 
				 			btrfs_node_key(right, &right_key, 0);
			
 
				 			tree_mod_log_set_node_key(root->fs_info, parent,
			
 
				-						  &right_key, pslot + 1, 0);
			
 
				+						  pslot + 1, 0);
			
 
				 			btrfs_set_node_key(parent, &right_key, pslot + 1);
			
 
				 			btrfs_mark_buffer_dirty(parent);
			
 
				 		}
			
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
				 	if (btrfs_header_nritems(mid) == 0) {
			
 
				 		clean_tree_block(trans, root, mid);
			
 
				 		btrfs_tree_unlock(mid);
			
 
				-		del_ptr(trans, root, path, level + 1, pslot, 1);
			
 
				+		del_ptr(trans, root, path, level + 1, pslot);
			
 
				 		root_sub_used(root, mid->len);
			
 
				 		btrfs_free_tree_block(trans, root, mid, 0, 1);
			
 
				 		free_extent_buffer_stale(mid);
			
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
				 		/* update the parent key to reflect our changes */
			
 
				 		struct btrfs_disk_key mid_key;
			
 
				 		btrfs_node_key(mid, &mid_key, 0);
			
 
				-		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
			
 
				+		tree_mod_log_set_node_key(root->fs_info, parent,
			
 
				 					  pslot, 0);
			
 
				 		btrfs_set_node_key(parent, &mid_key, pslot);
			
 
				 		btrfs_mark_buffer_dirty(parent);
			
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 
				 			orig_slot += left_nr;
			
 
				 			btrfs_node_key(mid, &disk_key, 0);
			
 
				 			tree_mod_log_set_node_key(root->fs_info, parent,
			
 
				-						  &disk_key, pslot, 0);
			
 
				+						  pslot, 0);
			
 
				 			btrfs_set_node_key(parent, &disk_key, pslot);
			
 
				 			btrfs_mark_buffer_dirty(parent);
			
 
				 			if (btrfs_header_nritems(left) > orig_slot) {
			
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 
				 
			
 
				 			btrfs_node_key(right, &disk_key, 0);
			
 
				 			tree_mod_log_set_node_key(root->fs_info, parent,
			
 
				-						  &disk_key, pslot + 1, 0);
			
 
				+						  pslot + 1, 0);
			
 
				 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			
 
				 			btrfs_mark_buffer_dirty(parent);
			
 
				 
			
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 
				 	int no_skips = 0;
			
 
				 	struct extent_buffer *t;
			
 
				 
			
 
				+	if (path->really_keep_locks)
			
 
				+		return;
			
 
				+
			
 
				 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
			
 
				 		if (!path->nodes[i])
			
 
				 			break;
			
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 
				 {
			
 
				 	int i;
			
 
				 
			
 
				-	if (path->keep_locks)
			
 
				+	if (path->keep_locks || path->really_keep_locks)
			
 
				 		return;
			
 
				 
			
 
				 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
			
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 
				 	if (!cow)
			
 
				 		write_lock_level = -1;
			
 
				 
			
 
				-	if (cow && (p->keep_locks || p->lowest_level))
			
 
				+	if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
			
 
				 		write_lock_level = BTRFS_MAX_LEVEL;
			
 
				 
			
 
				 	min_write_lock_level = write_lock_level;
			
@@ -2568,7 +2564,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 
				 			 * must have write locks on this node and the
			
 
				 			 * parent
			
 
				 			 */
			
 
				-			if (level + 1 > write_lock_level) {
			
 
				+			if (level > write_lock_level ||
			
 
				+			    (level + 1 > write_lock_level &&
			
 
				+			    level + 1 < BTRFS_MAX_LEVEL &&
			
 
				+			    p->nodes[level + 1])) {
			
 
				 				write_lock_level = level + 1;
			
 
				 				btrfs_release_path(p);
			
 
				 				goto again;
			
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
 
				 		if (!path->nodes[i])
			
 
				 			break;
			
 
				 		t = path->nodes[i];
			
 
				-		tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
			
 
				+		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
			
 
				 		btrfs_set_node_key(t, key, tslot);
			
 
				 		btrfs_mark_buffer_dirty(path->nodes[i]);
			
 
				 		if (tslot != 0)
			
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 
				  */
			
 
				 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
			
 
				 {
			
 
				+	struct btrfs_item *start_item;
			
 
				+	struct btrfs_item *end_item;
			
 
				+	struct btrfs_map_token token;
			
 
				 	int data_len;
			
 
				 	int nritems = btrfs_header_nritems(l);
			
 
				 	int end = min(nritems, start + nr) - 1;
			
 
				 
			
 
				 	if (!nr)
			
 
				 		return 0;
			
 
				-	data_len = btrfs_item_end_nr(l, start);
			
 
				-	data_len = data_len - btrfs_item_offset_nr(l, end);
			
 
				+	btrfs_init_map_token(&token);
			
 
				+	start_item = btrfs_item_nr(l, start);
			
 
				+	end_item = btrfs_item_nr(l, end);
			
 
				+	data_len = btrfs_token_item_offset(l, start_item, &token) +
			
 
				+		btrfs_token_item_size(l, start_item, &token);
			
 
				+	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
			
 
				 	data_len += sizeof(struct btrfs_item) * nr;
			
 
				 	WARN_ON(data_len < 0);
			
 
				 	return data_len;
			
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 
				 	if (push_items == 0)
			
 
				 		goto out_unlock;
			
 
				 
			
 
				-	if (!empty && push_items == left_nritems)
			
 
				-		WARN_ON(1);
			
 
				+	WARN_ON(!empty && push_items == left_nritems);
			
 
				 
			
 
				 	/* push left to right */
			
 
				 	right_nritems = btrfs_header_nritems(right);
			
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 
				 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
			
 
				 
			
 
				 	/* fixup right node */
			
 
				-	if (push_items > right_nritems) {
			
 
				-		printk(KERN_CRIT "push items %d nr %u\n", push_items,
			
 
				+	if (push_items > right_nritems)
			
 
				+		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
			
 
				 		       right_nritems);
			
 
				-		WARN_ON(1);
			
 
				-	}
			
 
				 
			
 
				 	if (push_items < right_nritems) {
			
 
				 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
			
@@ -4602,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 
				  * empty a node.
			
 
				  */
			
 
				 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
			
 
				-		    struct btrfs_path *path, int level, int slot,
			
 
				-		    int tree_mod_log)
			
 
				+		    struct btrfs_path *path, int level, int slot)
			
 
				 {
			
 
				 	struct extent_buffer *parent = path->nodes[level];
			
 
				 	u32 nritems;
			
 
				 	int ret;
			
 
				 
			
 
				+	if (level) {
			
 
				+		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
			
 
				+					      MOD_LOG_KEY_REMOVE);
			
 
				+		BUG_ON(ret < 0);
			
 
				+	}
			
 
				+
			
 
				 	nritems = btrfs_header_nritems(parent);
			
 
				 	if (slot != nritems - 1) {
			
 
				-		if (tree_mod_log && level)
			
 
				+		if (level)
			
 
				 			tree_mod_log_eb_move(root->fs_info, parent, slot,
			
 
				 					     slot + 1, nritems - slot - 1);
			
 
				 		memmove_extent_buffer(parent,
			
@@ -4619,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
				 			      btrfs_node_key_ptr_offset(slot + 1),
			
 
				 			      sizeof(struct btrfs_key_ptr) *
			
 
				 			      (nritems - slot - 1));
			
 
				-	} else if (tree_mod_log && level) {
			
 
				-		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
			
 
				-					      MOD_LOG_KEY_REMOVE);
			
 
				-		BUG_ON(ret < 0);
			
 
				 	}
			
 
				 
			
 
				 	nritems--;
			
@@ -4656,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
 
				 				    struct extent_buffer *leaf)
			
 
				 {
			
 
				 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
			
 
				-	del_ptr(trans, root, path, 1, path->slots[1], 1);
			
 
				+	del_ptr(trans, root, path, 1, path->slots[1]);
			
 
				 
			
 
				 	/*
			
 
				 	 * btrfs_free_extent is expensive, we want to make sure we
			
@@ -5123,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 
				 	right_path->search_commit_root = 1;
			
 
				 	right_path->skip_locking = 1;
			
 
				 
			
 
				-	spin_lock(&left_root->root_times_lock);
			
 
				+	spin_lock(&left_root->root_item_lock);
			
 
				 	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
			
 
				-	spin_unlock(&left_root->root_times_lock);
			
 
				+	spin_unlock(&left_root->root_item_lock);
			
 
				 
			
 
				-	spin_lock(&right_root->root_times_lock);
			
 
				+	spin_lock(&right_root->root_item_lock);
			
 
				 	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
			
 
				-	spin_unlock(&right_root->root_times_lock);
			
 
				+	spin_unlock(&right_root->root_item_lock);
			
 
				 
			
 
				 	trans = btrfs_join_transaction(left_root);
			
 
				 	if (IS_ERR(trans)) {
			
@@ -5224,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 
				 				goto out;
			
 
				 			}
			
 
				 
			
 
				-			spin_lock(&left_root->root_times_lock);
			
 
				+			spin_lock(&left_root->root_item_lock);
			
 
				 			ctransid = btrfs_root_ctransid(&left_root->root_item);
			
 
				-			spin_unlock(&left_root->root_times_lock);
			
 
				+			spin_unlock(&left_root->root_item_lock);
			
 
				 			if (ctransid != left_start_ctransid)
			
 
				 				left_start_ctransid = 0;
			
 
				 
			
 
				-			spin_lock(&right_root->root_times_lock);
			
 
				+			spin_lock(&right_root->root_item_lock);
			
 
				 			ctransid = btrfs_root_ctransid(&right_root->root_item);
			
 
				-			spin_unlock(&right_root->root_times_lock);
			
 
				+			spin_unlock(&right_root->root_item_lock);
			
 
				 			if (ctransid != right_start_ctransid)
			
 
				 				right_start_ctransid = 0;
			
 
				 
			
@@ -5496,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
				 	return btrfs_next_old_leaf(root, path, 0);
			
 
				 }
			
 
				 
			
 
				+/* Release the path up to but not including the given level */
			
 
				+static void btrfs_release_level(struct btrfs_path *path, int level)
			
 
				+{
			
 
				+	int i;
			
 
				+
			
 
				+	for (i = 0; i < level; i++) {
			
 
				+		path->slots[i] = 0;
			
 
				+		if (!path->nodes[i])
			
 
				+			continue;
			
 
				+		if (path->locks[i]) {
			
 
				+			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			
 
				+			path->locks[i] = 0;
			
 
				+		}
			
 
				+		free_extent_buffer(path->nodes[i]);
			
 
				+		path->nodes[i] = NULL;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * This function assumes 2 things
			
 
				+ *
			
 
				+ * 1) You are using path->keep_locks or path->really_keep_locks
			
 
				+ * 2) You are not inserting items.
			
 
				+ *
			
 
				+ * If either of these are not true do not use this function. If you need a next
			
 
				+ * leaf with either of these not being true then this function can be easily
			
 
				+ * adapted to do that, but at the moment these are the limitations.
			
 
				+ */
			
 
				+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
			
 
				+			  struct btrfs_root *root, struct btrfs_path *path,
			
 
				+			  int del)
			
 
				+{
			
 
				+	struct extent_buffer *b;
			
 
				+	struct btrfs_key key;
			
 
				+	u32 nritems;
			
 
				+	int level = 1;
			
 
				+	int slot;
			
 
				+	int ret = 1;
			
 
				+	int write_lock_level = BTRFS_MAX_LEVEL;
			
 
				+	int ins_len = del ? -1 : 0;
			
 
				+
			
 
				+	WARN_ON(!(path->keep_locks || path->really_keep_locks));
			
 
				+
			
 
				+	nritems = btrfs_header_nritems(path->nodes[0]);
			
 
				+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
			
 
				+
			
 
				+	while (path->nodes[level]) {
			
 
				+		nritems = btrfs_header_nritems(path->nodes[level]);
			
 
				+		if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
			
 
				+search:
			
 
				+			btrfs_release_path(path);
			
 
				+			ret = btrfs_search_slot(trans, root, &key, path,
			
 
				+						ins_len, 1);
			
 
				+			if (ret < 0)
			
 
				+				goto out;
			
 
				+			level = 1;
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		if (path->slots[level] >= nritems - 1) {
			
 
				+			level++;
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		btrfs_release_level(path, level);
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	if (!path->nodes[level]) {
			
 
				+		ret = 1;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	path->slots[level]++;
			
 
				+	b = path->nodes[level];
			
 
				+
			
 
				+	while (b) {
			
 
				+		level = btrfs_header_level(b);
			
 
				+
			
 
				+		if (!should_cow_block(trans, root, b))
			
 
				+			goto cow_done;
			
 
				+
			
 
				+		btrfs_set_path_blocking(path);
			
 
				+		ret = btrfs_cow_block(trans, root, b,
			
 
				+				      path->nodes[level + 1],
			
 
				+				      path->slots[level + 1], &b);
			
 
				+		if (ret)
			
 
				+			goto out;
			
 
				+cow_done:
			
 
				+		path->nodes[level] = b;
			
 
				+		btrfs_clear_path_blocking(path, NULL, 0);
			
 
				+		if (level != 0) {
			
 
				+			ret = setup_nodes_for_search(trans, root, path, b,
			
 
				+						     level, ins_len,
			
 
				+						     &write_lock_level);
			
 
				+			if (ret == -EAGAIN)
			
 
				+				goto search;
			
 
				+			if (ret)
			
 
				+				goto out;
			
 
				+
			
 
				+			b = path->nodes[level];
			
 
				+			slot = path->slots[level];
			
 
				+
			
 
				+			ret = read_block_for_search(trans, root, path,
			
 
				+						    &b, level, slot, &key, 0);
			
 
				+			if (ret == -EAGAIN)
			
 
				+				goto search;
			
 
				+			if (ret)
			
 
				+				goto out;
			
 
				+			level = btrfs_header_level(b);
			
 
				+			if (!btrfs_try_tree_write_lock(b)) {
			
 
				+				btrfs_set_path_blocking(path);
			
 
				+				btrfs_tree_lock(b);
			
 
				+				btrfs_clear_path_blocking(path, b,
			
 
				+							  BTRFS_WRITE_LOCK);
			
 
				+			}
			
 
				+			path->locks[level] = BTRFS_WRITE_LOCK;
			
 
				+			path->nodes[level] = b;
			
 
				+			path->slots[level] = 0;
			
 
				+		} else {
			
 
				+			path->slots[level] = 0;
			
 
				+			ret = 0;
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+out:
			
 
				+	if (ret)
			
 
				+		btrfs_release_path(path);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			
 
				 			u64 time_seq)
			
 
				 {
			
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
 
				 
			
 
				 #define BTRFS_MAGIC "_BHRfS_M"
			
 
				 
			
 
				-#define BTRFS_MAX_MIRRORS 2
			
 
				+#define BTRFS_MAX_MIRRORS 3
			
 
				 
			
 
				 #define BTRFS_MAX_LEVEL 8
			
 
				 
			
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
 
				 
			
 
				 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
			
 
				 
			
 
				+#define BTRFS_DEV_REPLACE_DEVID 0
			
 
				+
			
 
				 /*
			
 
				  * the max metadata block size.  This limit is somewhat artificial,
			
 
				  * but the memmove costs go through the roof for larger blocks.
			
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 
				 /* four bytes for CRC32 */
			
 
				 #define BTRFS_EMPTY_DIR_SIZE 0
			
 
				 
			
 
				+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
			
 
				+#define REQ_GET_READ_MIRRORS	(1 << 30)
			
 
				+
			
 
				 #define BTRFS_FT_UNKNOWN	0
			
 
				 #define BTRFS_FT_REG_FILE	1
			
 
				 #define BTRFS_FT_DIR		2
			
@@ -571,6 +576,7 @@ struct btrfs_path {
 
				 	unsigned int skip_locking:1;
			
 
				 	unsigned int leave_spinning:1;
			
 
				 	unsigned int search_commit_root:1;
			
 
				+	unsigned int really_keep_locks:1;
			
 
				 };
			
 
				 
			
 
				 /*
			
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
 
				 	__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
			
 
				 } __attribute__ ((__packed__));
			
 
				 
			
 
				+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
			
 
				+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
			
 
				+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED	0
			
 
				+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED		1
			
 
				+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED		2
			
 
				+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED		3
			
 
				+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED		4
			
 
				+
			
 
				+struct btrfs_dev_replace {
			
 
				+	u64 replace_state;	/* see #define above */
			
 
				+	u64 time_started;	/* seconds since 1-Jan-1970 */
			
 
				+	u64 time_stopped;	/* seconds since 1-Jan-1970 */
			
 
				+	atomic64_t num_write_errors;
			
 
				+	atomic64_t num_uncorrectable_read_errors;
			
 
				+
			
 
				+	u64 cursor_left;
			
 
				+	u64 committed_cursor_left;
			
 
				+	u64 cursor_left_last_write_of_item;
			
 
				+	u64 cursor_right;
			
 
				+
			
 
				+	u64 cont_reading_from_srcdev_mode;	/* see #define above */
			
 
				+
			
 
				+	int is_valid;
			
 
				+	int item_needs_writeback;
			
 
				+	struct btrfs_device *srcdev;
			
 
				+	struct btrfs_device *tgtdev;
			
 
				+
			
 
				+	pid_t lock_owner;
			
 
				+	atomic_t nesting_level;
			
 
				+	struct mutex lock_finishing_cancel_unmount;
			
 
				+	struct mutex lock_management_lock;
			
 
				+	struct mutex lock;
			
 
				+
			
 
				+	struct btrfs_scrub_progress scrub_progress;
			
 
				+};
			
 
				+
			
 
				+struct btrfs_dev_replace_item {
			
 
				+	/*
			
 
				+	 * grow this item struct at the end for future enhancements and keep
			
 
				+	 * the existing values unchanged
			
 
				+	 */
			
 
				+	__le64 src_devid;
			
 
				+	__le64 cursor_left;
			
 
				+	__le64 cursor_right;
			
 
				+	__le64 cont_reading_from_srcdev_mode;
			
 
				+
			
 
				+	__le64 replace_state;
			
 
				+	__le64 time_started;
			
 
				+	__le64 time_stopped;
			
 
				+	__le64 num_write_errors;
			
 
				+	__le64 num_uncorrectable_read_errors;
			
 
				+} __attribute__ ((__packed__));
			
 
				+
			
 
				 /* different types of block groups (and chunks) */
			
 
				 #define BTRFS_BLOCK_GROUP_DATA		(1ULL << 0)
			
 
				 #define BTRFS_BLOCK_GROUP_SYSTEM	(1ULL << 1)
			
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
 
				 	struct btrfs_workers generic_worker;
			
 
				 	struct btrfs_workers workers;
			
 
				 	struct btrfs_workers delalloc_workers;
			
 
				+	struct btrfs_workers flush_workers;
			
 
				 	struct btrfs_workers endio_workers;
			
 
				 	struct btrfs_workers endio_meta_workers;
			
 
				 	struct btrfs_workers endio_meta_write_workers;
			
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
 
				 	struct rw_semaphore scrub_super_lock;
			
 
				 	int scrub_workers_refcnt;
			
 
				 	struct btrfs_workers scrub_workers;
			
 
				+	struct btrfs_workers scrub_wr_completion_workers;
			
 
				+	struct btrfs_workers scrub_nocow_workers;
			
 
				 
			
 
				 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
			
 
				 	u32 check_integrity_print_mask;
			
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
 
				 	int backup_root_index;
			
 
				 
			
 
				 	int num_tolerated_disk_barrier_failures;
			
 
				+
			
 
				+	/* device replace state */
			
 
				+	struct btrfs_dev_replace dev_replace;
			
 
				+
			
 
				+	atomic_t mutually_exclusive_operation_running;
			
 
				 };
			
 
				 
			
 
				 /*
			
@@ -1579,7 +1646,7 @@ struct btrfs_root {
 
				 
			
 
				 	int force_cow;
			
 
				 
			
 
				-	spinlock_t root_times_lock;
			
 
				+	spinlock_t root_item_lock;
			
 
				 };
			
 
				 
			
 
				 struct btrfs_ioctl_defrag_range_args {
			
@@ -1722,6 +1789,12 @@ struct btrfs_ioctl_defrag_range_args {
 
				  */
			
 
				 #define BTRFS_DEV_STATS_KEY	249
			
 
				 
			
 
				+/*
			
 
				+ * Persistently stores the device replace state in the device tree.
			
 
				+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
			
 
				+ */
			
 
				+#define BTRFS_DEV_REPLACE_KEY	250
			
 
				+
			
 
				 /*
			
 
				  * string items are for debugging.  They just store a short string of
			
 
				  * data in the FS
			
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
 
				 
			
 
				 static inline void btrfs_init_map_token (struct btrfs_map_token *token)
			
 
				 {
			
 
				-	memset(token, 0, sizeof(*token));
			
 
				+	token->kaddr = NULL;
			
 
				 }
			
 
				 
			
 
				 /* some macros to generate set/get funcs for the struct fields.  This
			
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
 
				 BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
			
 
				 		   rsv_excl, 64);
			
 
				 
			
 
				+/* btrfs_dev_replace_item */
			
 
				+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
			
 
				+		   struct btrfs_dev_replace_item, src_devid, 64);
			
 
				+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
			
 
				+		   struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
			
 
				+		   64);
			
 
				+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
			
 
				+		   replace_state, 64);
			
 
				+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
			
 
				+		   time_started, 64);
			
 
				+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
			
 
				+		   time_stopped, 64);
			
 
				+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
			
 
				+		   num_write_errors, 64);
			
 
				+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
			
 
				+		   struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
			
 
				+		   64);
			
 
				+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
			
 
				+		   cursor_left, 64);
			
 
				+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
			
 
				+		   cursor_right, 64);
			
 
				+
			
 
				+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
			
 
				+			 struct btrfs_dev_replace_item, src_devid, 64);
			
 
				+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
			
 
				+			 struct btrfs_dev_replace_item,
			
 
				+			 cont_reading_from_srcdev_mode, 64);
			
 
				+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
			
 
				+			 struct btrfs_dev_replace_item, replace_state, 64);
			
 
				+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
			
 
				+			 struct btrfs_dev_replace_item, time_started, 64);
			
 
				+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
			
 
				+			 struct btrfs_dev_replace_item, time_stopped, 64);
			
 
				+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
			
 
				+			 struct btrfs_dev_replace_item, num_write_errors, 64);
			
 
				+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
			
 
				+			 struct btrfs_dev_replace_item,
			
 
				+			 num_uncorrectable_read_errors, 64);
			
 
				+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
			
 
				+			 struct btrfs_dev_replace_item, cursor_left, 64);
			
 
				+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
			
 
				+			 struct btrfs_dev_replace_item, cursor_right, 64);
			
 
				+
			
 
				 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
			
 
				 {
			
 
				 	return sb->s_fs_info;
			
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 
				 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
			
 
				 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
			
 
				 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
			
 
				+
			
 
				+enum btrfs_reserve_flush_enum {
			
 
				+	/* If we are in the transaction, we can't flush anything. */
			
 
				+	BTRFS_RESERVE_NO_FLUSH,
			
 
				+	/*
			
 
				+	 * Flushing delalloc may cause a deadlock in some contexts; in that
			
 
				+	 * case, use BTRFS_RESERVE_FLUSH_LIMIT
			
 
				+	 */
			
 
				+	BTRFS_RESERVE_FLUSH_LIMIT,
			
 
				+	BTRFS_RESERVE_FLUSH_ALL,
			
 
				+};
			
 
				+
			
 
				 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
			
 
				 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
			
 
				 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
			
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 
				 void btrfs_free_block_rsv(struct btrfs_root *root,
			
 
				 			  struct btrfs_block_rsv *rsv);
			
 
				 int btrfs_block_rsv_add(struct btrfs_root *root,
			
 
				-			struct btrfs_block_rsv *block_rsv,
			
 
				-			u64 num_bytes);
			
 
				-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
			
 
				-				struct btrfs_block_rsv *block_rsv,
			
 
				-				u64 num_bytes);
			
 
				+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
			
 
				+			enum btrfs_reserve_flush_enum flush);
			
 
				 int btrfs_block_rsv_check(struct btrfs_root *root,
			
 
				 			  struct btrfs_block_rsv *block_rsv, int min_factor);
			
 
				 int btrfs_block_rsv_refill(struct btrfs_root *root,
			
 
				-			  struct btrfs_block_rsv *block_rsv,
			
 
				-			  u64 min_reserved);
			
 
				-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
			
 
				-				   struct btrfs_block_rsv *block_rsv,
			
 
				-				   u64 min_reserved);
			
 
				+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
			
 
				+			   enum btrfs_reserve_flush_enum flush);
			
 
				 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
			
 
				 			    struct btrfs_block_rsv *dst_rsv,
			
 
				 			    u64 num_bytes);
			
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 
				 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
			
 
				 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
			
 
				 					 struct btrfs_fs_info *fs_info);
			
 
				+int __get_raid_index(u64 flags);
			
 
				 /* ctree.c */
			
 
				 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
			
 
				 		     int level, int *slot);
			
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 
				 }
			
 
				 
			
 
				 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
			
 
				+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
			
 
				+			  struct btrfs_root *root, struct btrfs_path *path,
			
 
				+			  int del);
			
 
				 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			
 
				 			u64 time_seq);
			
 
				 static inline int btrfs_next_old_item(struct btrfs_root *root,
			
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 
				 			     struct btrfs_root *root);
			
 
				 
			
 
				 /* dir-item.c */
			
 
				+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
			
 
				+			  const char *name, int name_len);
			
 
				 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
			
 
				 			  struct btrfs_root *root, const char *name,
			
 
				 			  int name_len, struct inode *dir,
			
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 
				 			     struct btrfs_root *root,
			
 
				 			     struct btrfs_path *path, u64 objectid,
			
 
				 			     u64 bytenr, int mod);
			
 
				+u64 btrfs_file_extent_length(struct btrfs_path *path);
			
 
				 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
			
 
				 			   struct btrfs_root *root,
			
 
				 			   struct btrfs_ordered_sum *sums);
			
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 
				 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
			
 
				 			     struct list_head *list, int search_commit);
			
 
				 /* inode.c */
			
 
				+struct btrfs_delalloc_work {
			
 
				+	struct inode *inode;
			
 
				+	int wait;
			
 
				+	int delay_iput;
			
 
				+	struct completion completion;
			
 
				+	struct list_head list;
			
 
				+	struct btrfs_work work;
			
 
				+};
			
 
				+
			
 
				+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
			
 
				+						    int wait, int delay_iput);
			
 
				+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
			
 
				+
			
 
				 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
			
 
				 					   size_t pg_offset, u64 start, u64 len,
			
 
				 					   int create);
			
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
 
				 				struct btrfs_ioctl_space_info *space);
			
 
				 
			
 
				 /* file.c */
			
 
				+int btrfs_auto_defrag_init(void);
			
 
				+void btrfs_auto_defrag_exit(void);
			
 
				 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			
 
				 			   struct inode *inode);
			
 
				 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
			
 
				+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
			
 
				 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
			
 
				 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			
 
				 			     int skip_pinned);
			
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 
				 			      struct btrfs_pending_snapshot *pending);
			
 
				 
			
 
				 /* scrub.c */
			
 
				-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
			
 
				-		    struct btrfs_scrub_progress *progress, int readonly);
			
 
				+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
			
 
				+		    u64 end, struct btrfs_scrub_progress *progress,
			
 
				+		    int readonly, int is_dev_replace);
			
 
				 void btrfs_scrub_pause(struct btrfs_root *root);
			
 
				 void btrfs_scrub_pause_super(struct btrfs_root *root);
			
 
				 void btrfs_scrub_continue(struct btrfs_root *root);
			
 
				 void btrfs_scrub_continue_super(struct btrfs_root *root);
			
 
				-int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
			
 
				-int btrfs_scrub_cancel(struct btrfs_root *root);
			
 
				-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
			
 
				+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
			
 
				+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
			
 
				+			   struct btrfs_device *dev);
			
 
				 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
			
 
				 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			
 
				 			 struct btrfs_scrub_progress *progress);
			
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 
				 	 */
			
 
				 	if (!src_rsv || (!trans->bytes_reserved &&
			
 
				 			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
			
 
				-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
			
 
				+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
			
 
				+					  BTRFS_RESERVE_NO_FLUSH);
			
 
				 		/*
			
 
				 		 * Since we're under a transaction reserve_metadata_bytes could
			
 
				 		 * try to commit the transaction which will make it return
			
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 
				 		 * reserve something strictly for us.  If not be a pain and try
			
 
				 		 * to steal from the delalloc block rsv.
			
 
				 		 */
			
 
				-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
			
 
				+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
			
 
				+					  BTRFS_RESERVE_NO_FLUSH);
			
 
				 		if (!ret)
			
 
				 			goto out;
			
 
				 
			
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 
				 	struct btrfs_delayed_node *delayed_node = NULL;
			
 
				 	struct btrfs_root *root;
			
 
				 	struct btrfs_block_rsv *block_rsv;
			
 
				-	unsigned long nr = 0;
			
 
				 	int need_requeue = 0;
			
 
				 	int ret;
			
 
				 
			
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 
				 					   delayed_node);
			
 
				 	mutex_unlock(&delayed_node->mutex);
			
 
				 
			
 
				-	nr = trans->blocks_used;
			
 
				-
			
 
				 	trans->block_rsv = block_rsv;
			
 
				 	btrfs_end_transaction_dmeta(trans, root);
			
 
				-	__btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty_nodelay(root);
			
 
				 free_path:
			
 
				 	btrfs_free_path(path);
			
 
				 out:
			
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
 
				+/*
			
 
				+ * Copyright (C) STRATO AG 2012.  All rights reserved.
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public
			
 
				+ * License v2 as published by the Free Software Foundation.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful,
			
 
				+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+ * General Public License for more details.
			
 
				+ *
			
 
				+ * You should have received a copy of the GNU General Public
			
 
				+ * License along with this program; if not, write to the
			
 
				+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
			
 
				+ * Boston, MA 02111-1307, USA.
			
 
				+ */
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/bio.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/buffer_head.h>
			
 
				+#include <linux/blkdev.h>
			
 
				+#include <linux/random.h>
			
 
				+#include <linux/iocontext.h>
			
 
				+#include <linux/capability.h>
			
 
				+#include <linux/kthread.h>
			
 
				+#include <linux/math64.h>
			
 
				+#include <asm/div64.h>
			
 
				+#include "compat.h"
			
 
				+#include "ctree.h"
			
 
				+#include "extent_map.h"
			
 
				+#include "disk-io.h"
			
 
				+#include "transaction.h"
			
 
				+#include "print-tree.h"
			
 
				+#include "volumes.h"
			
 
				+#include "async-thread.h"
			
 
				+#include "check-integrity.h"
			
 
				+#include "rcu-string.h"
			
 
				+#include "dev-replace.h"
			
 
				+
			
 
				+static u64 btrfs_get_seconds_since_1970(void);
			
 
				+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
			
 
				+				       int scrub_ret);
			
 
				+static void btrfs_dev_replace_update_device_in_mapping_tree(
			
 
				+						struct btrfs_fs_info *fs_info,
			
 
				+						struct btrfs_device *srcdev,
			
 
				+						struct btrfs_device *tgtdev);
			
 
				+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
			
 
				+					 char *srcdev_name,
			
 
				+					 struct btrfs_device **device);
			
 
				+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
			
 
				+static int btrfs_dev_replace_kthread(void *data);
			
 
				+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
			
 
				+
			
 
				+
			
 
				+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
			
 
				+{
			
 
				+	struct btrfs_key key;
			
 
				+	struct btrfs_root *dev_root = fs_info->dev_root;
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+	struct extent_buffer *eb;
			
 
				+	int slot;
			
 
				+	int ret = 0;
			
 
				+	struct btrfs_path *path = NULL;
			
 
				+	int item_size;
			
 
				+	struct btrfs_dev_replace_item *ptr;
			
 
				+	u64 src_devid;
			
 
				+
			
 
				+	path = btrfs_alloc_path();
			
 
				+	if (!path) {
			
 
				+		ret = -ENOMEM;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	key.objectid = 0;
			
 
				+	key.type = BTRFS_DEV_REPLACE_KEY;
			
 
				+	key.offset = 0;
			
 
				+	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
			
 
				+	if (ret) {
			
 
				+no_valid_dev_replace_entry_found:
			
 
				+		ret = 0;
			
 
				+		dev_replace->replace_state =
			
 
				+			BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
			
 
				+		dev_replace->cont_reading_from_srcdev_mode =
			
 
				+		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
			
 
				+		dev_replace->replace_state = 0;
			
 
				+		dev_replace->time_started = 0;
			
 
				+		dev_replace->time_stopped = 0;
			
 
				+		atomic64_set(&dev_replace->num_write_errors, 0);
			
 
				+		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
			
 
				+		dev_replace->cursor_left = 0;
			
 
				+		dev_replace->committed_cursor_left = 0;
			
 
				+		dev_replace->cursor_left_last_write_of_item = 0;
			
 
				+		dev_replace->cursor_right = 0;
			
 
				+		dev_replace->srcdev = NULL;
			
 
				+		dev_replace->tgtdev = NULL;
			
 
				+		dev_replace->is_valid = 0;
			
 
				+		dev_replace->item_needs_writeback = 0;
			
 
				+		goto out;
			
 
				+	}
			
 
				+	slot = path->slots[0];
			
 
				+	eb = path->nodes[0];
			
 
				+	item_size = btrfs_item_size_nr(eb, slot);
			
 
				+	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
			
 
				+
			
 
				+	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
			
 
				+		pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
			
 
				+		goto no_valid_dev_replace_entry_found;
			
 
				+	}
			
 
				+
			
 
				+	src_devid = btrfs_dev_replace_src_devid(eb, ptr);
			
 
				+	dev_replace->cont_reading_from_srcdev_mode =
			
 
				+		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
			
 
				+	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
			
 
				+	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
			
 
				+	dev_replace->time_stopped =
			
 
				+		btrfs_dev_replace_time_stopped(eb, ptr);
			
 
				+	atomic64_set(&dev_replace->num_write_errors,
			
 
				+		     btrfs_dev_replace_num_write_errors(eb, ptr));
			
 
				+	atomic64_set(&dev_replace->num_uncorrectable_read_errors,
			
 
				+		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
			
 
				+	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
			
 
				+	dev_replace->committed_cursor_left = dev_replace->cursor_left;
			
 
				+	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
			
 
				+	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
			
 
				+	dev_replace->is_valid = 1;
			
 
				+
			
 
				+	dev_replace->item_needs_writeback = 0;
			
 
				+	switch (dev_replace->replace_state) {
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
			
 
				+		dev_replace->srcdev = NULL;
			
 
				+		dev_replace->tgtdev = NULL;
			
 
				+		break;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
			
 
				+		dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
			
 
				+							NULL, NULL);
			
 
				+		dev_replace->tgtdev = btrfs_find_device(fs_info,
			
 
				+							BTRFS_DEV_REPLACE_DEVID,
			
 
				+							NULL, NULL);
			
 
				+		/*
			
 
				+		 * allow 'btrfs dev replace_cancel' if src/tgt device is
			
 
				+		 * missing
			
 
				+		 */
			
 
				+		if (!dev_replace->srcdev &&
			
 
				+		    !btrfs_test_opt(dev_root, DEGRADED)) {
			
 
				+			ret = -EIO;
			
 
				+			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
			
 
				+				(unsigned long long)src_devid);
			
 
				+		}
			
 
				+		if (!dev_replace->tgtdev &&
			
 
				+		    !btrfs_test_opt(dev_root, DEGRADED)) {
			
 
				+			ret = -EIO;
			
 
				+			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
			
 
				+				(unsigned long long)BTRFS_DEV_REPLACE_DEVID);
			
 
				+		}
			
 
				+		if (dev_replace->tgtdev) {
			
 
				+			if (dev_replace->srcdev) {
			
 
				+				dev_replace->tgtdev->total_bytes =
			
 
				+					dev_replace->srcdev->total_bytes;
			
 
				+				dev_replace->tgtdev->disk_total_bytes =
			
 
				+					dev_replace->srcdev->disk_total_bytes;
			
 
				+				dev_replace->tgtdev->bytes_used =
			
 
				+					dev_replace->srcdev->bytes_used;
			
 
				+			}
			
 
				+			dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
			
 
				+			btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
			
 
				+				dev_replace->tgtdev);
			
 
				+		}
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+out:
			
 
				+	if (path)
			
 
				+		btrfs_free_path(path);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * called from commit_transaction. Writes changed device replace state to
			
 
				+ * disk.
			
 
				+ */
			
 
				+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
			
 
				+			  struct btrfs_fs_info *fs_info)
			
 
				+{
			
 
				+	int ret;
			
 
				+	struct btrfs_root *dev_root = fs_info->dev_root;
			
 
				+	struct btrfs_path *path;
			
 
				+	struct btrfs_key key;
			
 
				+	struct extent_buffer *eb;
			
 
				+	struct btrfs_dev_replace_item *ptr;
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	if (!dev_replace->is_valid ||
			
 
				+	    !dev_replace->item_needs_writeback) {
			
 
				+		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		return 0;
			
 
				+	}
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+
			
 
				+	key.objectid = 0;
			
 
				+	key.type = BTRFS_DEV_REPLACE_KEY;
			
 
				+	key.offset = 0;
			
 
				+
			
 
				+	path = btrfs_alloc_path();
			
 
				+	if (!path) {
			
 
				+		ret = -ENOMEM;
			
 
				+		goto out;
			
 
				+	}
			
 
				+	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
			
 
				+	if (ret < 0) {
			
 
				+		pr_warn("btrfs: error %d while searching for dev_replace item!\n",
			
 
				+			ret);
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	if (ret == 0 &&
			
 
				+	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
			
 
				+		/*
			
 
				+		 * need to delete old one and insert a new one.
			
 
				+		 * Since no attempt is made to recover any old state, if the
			
 
				+		 * dev_replace state is 'running', the data on the target
			
 
				+		 * drive is lost.
			
 
				+		 * It would be possible to recover the state: just make sure
			
 
				+		 * that the beginning of the item is never changed and always
			
 
				+		 * contains all the essential information. Then read this
			
 
				+		 * minimal set of information and use it as a base for the
			
 
				+		 * new state.
			
 
				+		 */
			
 
				+		ret = btrfs_del_item(trans, dev_root, path);
			
 
				+		if (ret != 0) {
			
 
				+			pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
			
 
				+				ret);
			
 
				+			goto out;
			
 
				+		}
			
 
				+		ret = 1;
			
 
				+	}
			
 
				+
			
 
				+	if (ret == 1) {
			
 
				+		/* need to insert a new item */
			
 
				+		btrfs_release_path(path);
			
 
				+		ret = btrfs_insert_empty_item(trans, dev_root, path,
			
 
				+					      &key, sizeof(*ptr));
			
 
				+		if (ret < 0) {
			
 
				+			pr_warn("btrfs: insert dev_replace item failed %d!\n",
			
 
				+				ret);
			
 
				+			goto out;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	eb = path->nodes[0];
			
 
				+	ptr = btrfs_item_ptr(eb, path->slots[0],
			
 
				+			     struct btrfs_dev_replace_item);
			
 
				+
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	if (dev_replace->srcdev)
			
 
				+		btrfs_set_dev_replace_src_devid(eb, ptr,
			
 
				+			dev_replace->srcdev->devid);
			
 
				+	else
			
 
				+		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
			
 
				+	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
			
 
				+		dev_replace->cont_reading_from_srcdev_mode);
			
 
				+	btrfs_set_dev_replace_replace_state(eb, ptr,
			
 
				+		dev_replace->replace_state);
			
 
				+	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
			
 
				+	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
			
 
				+	btrfs_set_dev_replace_num_write_errors(eb, ptr,
			
 
				+		atomic64_read(&dev_replace->num_write_errors));
			
 
				+	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
			
 
				+		atomic64_read(&dev_replace->num_uncorrectable_read_errors));
			
 
				+	dev_replace->cursor_left_last_write_of_item =
			
 
				+		dev_replace->cursor_left;
			
 
				+	btrfs_set_dev_replace_cursor_left(eb, ptr,
			
 
				+		dev_replace->cursor_left_last_write_of_item);
			
 
				+	btrfs_set_dev_replace_cursor_right(eb, ptr,
			
 
				+		dev_replace->cursor_right);
			
 
				+	dev_replace->item_needs_writeback = 0;
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+
			
 
				+	btrfs_mark_buffer_dirty(eb);
			
 
				+
			
 
				+out:
			
 
				+	btrfs_free_path(path);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
			
 
				+{
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+
			
 
				+	dev_replace->committed_cursor_left =
			
 
				+		dev_replace->cursor_left_last_write_of_item;
			
 
				+}
			
 
				+
			
 
				+static u64 btrfs_get_seconds_since_1970(void)
			
 
				+{
			
 
				+	struct timespec t = CURRENT_TIME_SEC;
			
 
				+
			
 
				+	return t.tv_sec;
			
 
				+}
			
 
				+
			
 
				+int btrfs_dev_replace_start(struct btrfs_root *root,
			
 
				+			    struct btrfs_ioctl_dev_replace_args *args)
			
 
				+{
			
 
				+	struct btrfs_trans_handle *trans;
			
 
				+	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+	int ret;
			
 
				+	struct btrfs_device *tgt_device = NULL;
			
 
				+	struct btrfs_device *src_device = NULL;
			
 
				+
			
 
				+	switch (args->start.cont_reading_from_srcdev_mode) {
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
			
 
				+		break;
			
 
				+	default:
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
			
 
				+	    args->start.tgtdev_name[0] == '\0')
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	mutex_lock(&fs_info->volume_mutex);
			
 
				+	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
			
 
				+					    &tgt_device);
			
 
				+	if (ret) {
			
 
				+		pr_err("btrfs: target device %s is invalid!\n",
			
 
				+		       args->start.tgtdev_name);
			
 
				+		mutex_unlock(&fs_info->volume_mutex);
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
			
 
				+					    args->start.srcdev_name,
			
 
				+					    &src_device);
			
 
				+	mutex_unlock(&fs_info->volume_mutex);
			
 
				+	if (ret) {
			
 
				+		ret = -EINVAL;
			
 
				+		goto leave_no_lock;
			
 
				+	}
			
 
				+
			
 
				+	if (tgt_device->total_bytes < src_device->total_bytes) {
			
 
				+		pr_err("btrfs: target device is smaller than source device!\n");
			
 
				+		ret = -EINVAL;
			
 
				+		goto leave_no_lock;
			
 
				+	}
			
 
				+
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	switch (dev_replace->replace_state) {
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
			
 
				+		break;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
			
 
				+		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
			
 
				+		goto leave;
			
 
				+	}
			
 
				+
			
 
				+	dev_replace->cont_reading_from_srcdev_mode =
			
 
				+		args->start.cont_reading_from_srcdev_mode;
			
 
				+	WARN_ON(!src_device);
			
 
				+	dev_replace->srcdev = src_device;
			
 
				+	WARN_ON(!tgt_device);
			
 
				+	dev_replace->tgtdev = tgt_device;
			
 
				+
			
 
				+	printk_in_rcu(KERN_INFO
			
 
				+		      "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
			
 
				+		      src_device->missing ? "<missing disk>" :
			
 
				+		        rcu_str_deref(src_device->name),
			
 
				+		      src_device->devid,
			
 
				+		      rcu_str_deref(tgt_device->name));
			
 
				+
			
 
				+	tgt_device->total_bytes = src_device->total_bytes;
			
 
				+	tgt_device->disk_total_bytes = src_device->disk_total_bytes;
			
 
				+	tgt_device->bytes_used = src_device->bytes_used;
			
 
				+
			
 
				+	/*
			
 
				+	 * from now on, the writes to the srcdev are all duplicated to
			
 
				+	 * go to the tgtdev as well (refer to btrfs_map_block()).
			
 
				+	 */
			
 
				+	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
			
 
				+	dev_replace->time_started = btrfs_get_seconds_since_1970();
			
 
				+	dev_replace->cursor_left = 0;
			
 
				+	dev_replace->committed_cursor_left = 0;
			
 
				+	dev_replace->cursor_left_last_write_of_item = 0;
			
 
				+	dev_replace->cursor_right = 0;
			
 
				+	dev_replace->is_valid = 1;
			
 
				+	dev_replace->item_needs_writeback = 1;
			
 
				+	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+
			
 
				+	btrfs_wait_ordered_extents(root, 0);
			
 
				+
			
 
				+	/* force writing the updated state information to disk */
			
 
				+	trans = btrfs_start_transaction(root, 0);
			
 
				+	if (IS_ERR(trans)) {
			
 
				+		ret = PTR_ERR(trans);
			
 
				+		btrfs_dev_replace_lock(dev_replace);
			
 
				+		goto leave;
			
 
				+	}
			
 
				+
			
 
				+	ret = btrfs_commit_transaction(trans, root);
			
 
				+	WARN_ON(ret);
			
 
				+
			
 
				+	/* the disk copy procedure reuses the scrub code */
			
 
				+	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
			
 
				+			      src_device->total_bytes,
			
 
				+			      &dev_replace->scrub_progress, 0, 1);
			
 
				+
			
 
				+	ret = btrfs_dev_replace_finishing(root->fs_info, ret);
			
 
				+	WARN_ON(ret);
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+leave:
			
 
				+	dev_replace->srcdev = NULL;
			
 
				+	dev_replace->tgtdev = NULL;
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+leave_no_lock:
			
 
				+	if (tgt_device)
			
 
				+		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
			
 
				+				       int scrub_ret)
			
 
				+{
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+	struct btrfs_device *tgt_device;
			
 
				+	struct btrfs_device *src_device;
			
 
				+	struct btrfs_root *root = fs_info->tree_root;
			
 
				+	u8 uuid_tmp[BTRFS_UUID_SIZE];
			
 
				+	struct btrfs_trans_handle *trans;
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	/* don't allow cancel or unmount to disturb the finishing procedure */
			
 
				+	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	/* was the operation canceled, or is it finished? */
			
 
				+	if (dev_replace->replace_state !=
			
 
				+	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
			
 
				+		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	tgt_device = dev_replace->tgtdev;
			
 
				+	src_device = dev_replace->srcdev;
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+
			
 
				+	/* replace old device with new one in mapping tree */
			
 
				+	if (!scrub_ret)
			
 
				+		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
			
 
				+								src_device,
			
 
				+								tgt_device);
			
 
				+
			
 
				+	/*
			
 
				+	 * flush all outstanding I/O and inode extent mappings before the
			
 
				+	 * copy operation is declared as being finished
			
 
				+	 */
			
 
				+	btrfs_start_delalloc_inodes(root, 0);
			
 
				+	btrfs_wait_ordered_extents(root, 0);
			
 
				+
			
 
				+	trans = btrfs_start_transaction(root, 0);
			
 
				+	if (IS_ERR(trans)) {
			
 
				+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+		return PTR_ERR(trans);
			
 
				+	}
			
 
				+	ret = btrfs_commit_transaction(trans, root);
			
 
				+	WARN_ON(ret);
			
 
				+
			
 
				+	/* keep away write_all_supers() during the finishing procedure */
			
 
				+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	dev_replace->replace_state =
			
 
				+		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
			
 
				+			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
			
 
				+	dev_replace->tgtdev = NULL;
			
 
				+	dev_replace->srcdev = NULL;
			
 
				+	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
			
 
				+	dev_replace->item_needs_writeback = 1;
			
 
				+
			
 
				+	if (scrub_ret) {
			
 
				+		printk_in_rcu(KERN_ERR
			
 
				+			      "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
			
 
				+			      src_device->missing ? "<missing disk>" :
			
 
				+			        rcu_str_deref(src_device->name),
			
 
				+			      src_device->devid,
			
 
				+			      rcu_str_deref(tgt_device->name), scrub_ret);
			
 
				+		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				+		if (tgt_device)
			
 
				+			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
			
 
				+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	printk_in_rcu(KERN_INFO
			
 
				+		      "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
			
 
				+		      src_device->missing ? "<missing disk>" :
			
 
				+		        rcu_str_deref(src_device->name),
			
 
				+		      src_device->devid,
			
 
				+		      rcu_str_deref(tgt_device->name));
			
 
				+	tgt_device->is_tgtdev_for_dev_replace = 0;
			
 
				+	tgt_device->devid = src_device->devid;
			
 
				+	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
			
 
				+	tgt_device->bytes_used = src_device->bytes_used;
			
 
				+	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
			
 
				+	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
			
 
				+	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
			
 
				+	tgt_device->total_bytes = src_device->total_bytes;
			
 
				+	tgt_device->disk_total_bytes = src_device->disk_total_bytes;
			
 
				+	tgt_device->bytes_used = src_device->bytes_used;
			
 
				+	if (fs_info->sb->s_bdev == src_device->bdev)
			
 
				+		fs_info->sb->s_bdev = tgt_device->bdev;
			
 
				+	if (fs_info->fs_devices->latest_bdev == src_device->bdev)
			
 
				+		fs_info->fs_devices->latest_bdev = tgt_device->bdev;
			
 
				+	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
			
 
				+
			
 
				+	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
			
 
				+	if (src_device->bdev) {
			
 
				+		/* zero out the old super */
			
 
				+		btrfs_scratch_superblock(src_device);
			
 
				+	}
			
 
				+	/*
			
 
				+	 * this is again a consistent state where no dev_replace procedure
			
 
				+	 * is running, the target device is part of the filesystem, the
			
 
				+	 * source device is not part of the filesystem anymore and its 1st
			
 
				+	 * superblock is scratched out so that it is no longer marked to
			
 
				+	 * belong to this filesystem.
			
 
				+	 */
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				+
			
 
				+	/* write back the superblocks */
			
 
				+	trans = btrfs_start_transaction(root, 0);
			
 
				+	if (!IS_ERR(trans))
			
 
				+		btrfs_commit_transaction(trans, root);
			
 
				+
			
 
				+	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void btrfs_dev_replace_update_device_in_mapping_tree(
			
 
				+						struct btrfs_fs_info *fs_info,
			
 
				+						struct btrfs_device *srcdev,
			
 
				+						struct btrfs_device *tgtdev)
			
 
				+{
			
 
				+	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
			
 
				+	struct extent_map *em;
			
 
				+	struct map_lookup *map;
			
 
				+	u64 start = 0;
			
 
				+	int i;
			
 
				+
			
 
				+	write_lock(&em_tree->lock);
			
 
				+	do {
			
 
				+		em = lookup_extent_mapping(em_tree, start, (u64)-1);
			
 
				+		if (!em)
			
 
				+			break;
			
 
				+		map = (struct map_lookup *)em->bdev;
			
 
				+		for (i = 0; i < map->num_stripes; i++)
			
 
				+			if (srcdev == map->stripes[i].dev)
			
 
				+				map->stripes[i].dev = tgtdev;
			
 
				+		start = em->start + em->len;
			
 
				+		free_extent_map(em);
			
 
				+	} while (start);
			
 
				+	write_unlock(&em_tree->lock);
			
 
				+}
			
 
				+
			
 
				+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
			
 
				+					 char *srcdev_name,
			
 
				+					 struct btrfs_device **device)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	if (srcdevid) {
			
 
				+		ret = 0;
			
 
				+		*device = btrfs_find_device(root->fs_info, srcdevid, NULL,
			
 
				+					    NULL);
			
 
				+		if (!*device)
			
 
				+			ret = -ENOENT;
			
 
				+	} else {
			
 
				+		ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
			
 
				+							   device);
			
 
				+	}
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
			
 
				+			      struct btrfs_ioctl_dev_replace_args *args)
			
 
				+{
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	/* even if !dev_replace_is_valid, the values are good enough for
			
 
				+	 * the replace_status ioctl */
			
 
				+	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
			
 
				+	args->status.replace_state = dev_replace->replace_state;
			
 
				+	args->status.time_started = dev_replace->time_started;
			
 
				+	args->status.time_stopped = dev_replace->time_stopped;
			
 
				+	args->status.num_write_errors =
			
 
				+		atomic64_read(&dev_replace->num_write_errors);
			
 
				+	args->status.num_uncorrectable_read_errors =
			
 
				+		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
			
 
				+	switch (dev_replace->replace_state) {
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
			
 
				+		args->status.progress_1000 = 0;
			
 
				+		break;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
 
				+		args->status.progress_1000 = 1000;
			
 
				+		break;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
			
 
				+		args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
			
 
				+			div64_u64(dev_replace->srcdev->total_bytes, 1000));
			
 
				+		break;
			
 
				+	}
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+}
			
 
				+
			
 
				+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
			
 
				+			     struct btrfs_ioctl_dev_replace_args *args)
			
 
				+{
			
 
				+	args->result = __btrfs_dev_replace_cancel(fs_info);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
			
 
				+{
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+	struct btrfs_device *tgt_device = NULL;
			
 
				+	struct btrfs_trans_handle *trans;
			
 
				+	struct btrfs_root *root = fs_info->tree_root;
			
 
				+	u64 result;
			
 
				+	int ret;
			
 
				+
			
 
				+	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	switch (dev_replace->replace_state) {
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
			
 
				+		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
			
 
				+		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		goto leave;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
			
 
				+		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
			
 
				+		tgt_device = dev_replace->tgtdev;
			
 
				+		dev_replace->tgtdev = NULL;
			
 
				+		dev_replace->srcdev = NULL;
			
 
				+		break;
			
 
				+	}
			
 
				+	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
			
 
				+	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
			
 
				+	dev_replace->item_needs_writeback = 1;
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_scrub_cancel(fs_info);
			
 
				+
			
 
				+	trans = btrfs_start_transaction(root, 0);
			
 
				+	if (IS_ERR(trans)) {
			
 
				+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+		return PTR_ERR(trans);
			
 
				+	}
			
 
				+	ret = btrfs_commit_transaction(trans, root);
			
 
				+	WARN_ON(ret);
			
 
				+	if (tgt_device)
			
 
				+		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
			
 
				+
			
 
				+leave:
			
 
				+	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+	return result;
			
 
				+}
			
 
				+
			
 
				+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
			
 
				+{
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+
			
 
				+	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	switch (dev_replace->replace_state) {
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
			
 
				+		break;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
			
 
				+		dev_replace->replace_state =
			
 
				+			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
			
 
				+		dev_replace->time_stopped = btrfs_get_seconds_since_1970();
			
 
				+		dev_replace->item_needs_writeback = 1;
			
 
				+		pr_info("btrfs: suspending dev_replace for unmount\n");
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				+}
			
 
				+
			
 
				+/* resume dev_replace procedure that was interrupted by unmount */
			
 
				+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
			
 
				+{
			
 
				+	struct task_struct *task;
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	switch (dev_replace->replace_state) {
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
			
 
				+		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		return 0;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
			
 
				+		break;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
			
 
				+		dev_replace->replace_state =
			
 
				+			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
			
 
				+		break;
			
 
				+	}
			
 
				+	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
			
 
				+		pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
			
 
				+			"btrfs: you may cancel the operation after 'mount -o degraded'\n");
			
 
				+		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		return 0;
			
 
				+	}
			
 
				+	btrfs_dev_replace_unlock(dev_replace);
			
 
				+
			
 
				+	WARN_ON(atomic_xchg(
			
 
				+		&fs_info->mutually_exclusive_operation_running, 1));
			
 
				+	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
			
 
				+	return PTR_RET(task);
			
 
				+}
			
 
				+
			
 
				+static int btrfs_dev_replace_kthread(void *data)
			
 
				+{
			
 
				+	struct btrfs_fs_info *fs_info = data;
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+	struct btrfs_ioctl_dev_replace_args *status_args;
			
 
				+	u64 progress;
			
 
				+
			
 
				+	status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
			
 
				+	if (status_args) {
			
 
				+		btrfs_dev_replace_status(fs_info, status_args);
			
 
				+		progress = status_args->status.progress_1000;
			
 
				+		kfree(status_args);
			
 
				+		do_div(progress, 10);
			
 
				+		printk_in_rcu(KERN_INFO
			
 
				+			      "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
			
 
				+			      dev_replace->srcdev->missing ? "<missing disk>" :
			
 
				+				rcu_str_deref(dev_replace->srcdev->name),
			
 
				+			      dev_replace->srcdev->devid,
			
 
				+			      dev_replace->tgtdev ?
			
 
				+				rcu_str_deref(dev_replace->tgtdev->name) :
			
 
				+				"<missing target disk>",
			
 
				+			      (unsigned int)progress);
			
 
				+	}
			
 
				+	btrfs_dev_replace_continue_on_mount(fs_info);
			
 
				+	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
			
 
				+{
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
			
 
				+			      dev_replace->committed_cursor_left,
			
 
				+			      dev_replace->srcdev->total_bytes,
			
 
				+			      &dev_replace->scrub_progress, 0, 1);
			
 
				+	ret = btrfs_dev_replace_finishing(fs_info, ret);
			
 
				+	WARN_ON(ret);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
			
 
				+{
			
 
				+	if (!dev_replace->is_valid)
			
 
				+		return 0;
			
 
				+
			
 
				+	switch (dev_replace->replace_state) {
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
			
 
				+		return 0;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
			
 
				+		/*
			
 
				+		 * return true even if tgtdev is missing (this is
			
 
				+		 * something that can happen if the dev_replace
			
 
				+		 * procedure is suspended by an umount and then
			
 
				+		 * the tgtdev is missing (or "btrfs dev scan") was
			
 
				+		 * not called and the the filesystem is remounted
			
 
				+		 * in degraded state. This does not stop the
			
 
				+		 * dev_replace procedure. It needs to be canceled
			
 
				+		 * manually if the cancelation is wanted.
			
 
				+		 */
			
 
				+		break;
			
 
				+	}
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
			
 
				+{
			
 
				+	/* the beginning is just an optimization for the typical case */
			
 
				+	if (atomic_read(&dev_replace->nesting_level) == 0) {
			
 
				+acquire_lock:
			
 
				+		/* this is not a nested case where the same thread
			
 
				+		 * is trying to acqurire the same lock twice */
			
 
				+		mutex_lock(&dev_replace->lock);
			
 
				+		mutex_lock(&dev_replace->lock_management_lock);
			
 
				+		dev_replace->lock_owner = current->pid;
			
 
				+		atomic_inc(&dev_replace->nesting_level);
			
 
				+		mutex_unlock(&dev_replace->lock_management_lock);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	mutex_lock(&dev_replace->lock_management_lock);
			
 
				+	if (atomic_read(&dev_replace->nesting_level) > 0 &&
			
 
				+	    dev_replace->lock_owner == current->pid) {
			
 
				+		WARN_ON(!mutex_is_locked(&dev_replace->lock));
			
 
				+		atomic_inc(&dev_replace->nesting_level);
			
 
				+		mutex_unlock(&dev_replace->lock_management_lock);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	mutex_unlock(&dev_replace->lock_management_lock);
			
 
				+	goto acquire_lock;
			
 
				+}
			
 
				+
			
 
				+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
			
 
				+{
			
 
				+	WARN_ON(!mutex_is_locked(&dev_replace->lock));
			
 
				+	mutex_lock(&dev_replace->lock_management_lock);
			
 
				+	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
			
 
				+	WARN_ON(dev_replace->lock_owner != current->pid);
			
 
				+	atomic_dec(&dev_replace->nesting_level);
			
 
				+	if (atomic_read(&dev_replace->nesting_level) == 0) {
			
 
				+		dev_replace->lock_owner = 0;
			
 
				+		mutex_unlock(&dev_replace->lock_management_lock);
			
 
				+		mutex_unlock(&dev_replace->lock);
			
 
				+	} else {
			
 
				+		mutex_unlock(&dev_replace->lock_management_lock);
			
 
				+	}
			
 
				+}
			
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
 
				+/*
			
 
				+ * Copyright (C) STRATO AG 2012.  All rights reserved.
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public
			
 
				+ * License v2 as published by the Free Software Foundation.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful,
			
 
				+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+ * General Public License for more details.
			
 
				+ *
			
 
				+ * You should have received a copy of the GNU General Public
			
 
				+ * License along with this program; if not, write to the
			
 
				+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
			
 
				+ * Boston, MA 02111-1307, USA.
			
 
				+ */
			
 
				+
			
 
				+#if !defined(__BTRFS_DEV_REPLACE__)
			
 
				+#define __BTRFS_DEV_REPLACE__
			
 
				+
			
 
				+struct btrfs_ioctl_dev_replace_args;
			
 
				+
			
 
				+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
			
 
				+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
			
 
				+			  struct btrfs_fs_info *fs_info);
			
 
				+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
			
 
				+int btrfs_dev_replace_start(struct btrfs_root *root,
			
 
				+			    struct btrfs_ioctl_dev_replace_args *args);
			
 
				+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
			
 
				+			      struct btrfs_ioctl_dev_replace_args *args);
			
 
				+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
			
 
				+			     struct btrfs_ioctl_dev_replace_args *args);
			
 
				+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
			
 
				+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
			
 
				+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
			
 
				+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
			
 
				+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
			
 
				+
			
 
				+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
			
 
				+{
			
 
				+	atomic64_inc(stat_value);
			
 
				+}
			
 
				+#endif
			
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 
				 	return btrfs_match_dir_item_name(root, path, name, name_len);
			
 
				 }
			
 
				 
			
 
				+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
			
 
				+				   const char *name, int name_len)
			
 
				+{
			
 
				+	int ret;
			
 
				+	struct btrfs_key key;
			
 
				+	struct btrfs_dir_item *di;
			
 
				+	int data_size;
			
 
				+	struct extent_buffer *leaf;
			
 
				+	int slot;
			
 
				+	struct btrfs_path *path;
			
 
				+
			
 
				+
			
 
				+	path = btrfs_alloc_path();
			
 
				+	if (!path)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	key.objectid = dir;
			
 
				+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
			
 
				+	key.offset = btrfs_name_hash(name, name_len);
			
 
				+
			
 
				+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
			
 
				+
			
 
				+	/* return back any errors */
			
 
				+	if (ret < 0)
			
 
				+		goto out;
			
 
				+
			
 
				+	/* nothing found, we're safe */
			
 
				+	if (ret > 0) {
			
 
				+		ret = 0;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	/* we found an item, look for our name in the item */
			
 
				+	di = btrfs_match_dir_item_name(root, path, name, name_len);
			
 
				+	if (di) {
			
 
				+		/* our exact name was found */
			
 
				+		ret = -EEXIST;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * see if there is room in the item to insert this
			
 
				+	 * name
			
 
				+	 */
			
 
				+	data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
			
 
				+	leaf = path->nodes[0];
			
 
				+	slot = path->slots[0];
			
 
				+	if (data_size + btrfs_item_size_nr(leaf, slot) +
			
 
				+	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
			
 
				+		ret = -EOVERFLOW;
			
 
				+	} else {
			
 
				+		/* plenty of insertion room */
			
 
				+		ret = 0;
			
 
				+	}
			
 
				+out:
			
 
				+	btrfs_free_path(path);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * lookup a directory item based on index.  'dir' is the objectid
			
 
				  * we're searching in, and 'mod' tells us if you plan on deleting the
			
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,7 @@
 
				 #include "inode-map.h"
			
 
				 #include "check-integrity.h"
			
 
				 #include "rcu-string.h"
			
 
				+#include "dev-replace.h"
			
 
				 
			
 
				 #ifdef CONFIG_X86
			
 
				 #include <asm/cpufeature.h>
			
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 
				 		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
			
 
				 			break;
			
 
				 
			
 
				-		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
			
 
				+		num_copies = btrfs_num_copies(root->fs_info,
			
 
				 					      eb->start, eb->len);
			
 
				 		if (num_copies == 1)
			
 
				 			break;
			
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 
				 				 int mirror_num, unsigned long bio_flags,
			
 
				 				 u64 bio_offset)
			
 
				 {
			
 
				+	int ret;
			
 
				+
			
 
				 	/*
			
 
				 	 * when we're called for a write, we're already in the async
			
 
				 	 * submission context.  Just jump into btrfs_map_bio
			
 
				 	 */
			
 
				-	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
			
 
				+	ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
			
 
				+	if (ret)
			
 
				+		bio_endio(bio, ret);
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 static int check_async_write(struct inode *inode, unsigned long bio_flags)
			
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
				 	int ret;
			
 
				 
			
 
				 	if (!(rw & REQ_WRITE)) {
			
 
				-
			
 
				 		/*
			
 
				 		 * called for a read, do the setup so that checksum validation
			
 
				 		 * can happen in the async kernel threads
			
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
				 		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
			
 
				 					  bio, 1);
			
 
				 		if (ret)
			
 
				-			return ret;
			
 
				-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
			
 
				-				     mirror_num, 0);
			
 
				+			goto out_w_error;
			
 
				+		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
			
 
				+				    mirror_num, 0);
			
 
				 	} else if (!async) {
			
 
				 		ret = btree_csum_one_bio(bio);
			
 
				 		if (ret)
			
 
				-			return ret;
			
 
				-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
			
 
				-				     mirror_num, 0);
			
 
				+			goto out_w_error;
			
 
				+		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
			
 
				+				    mirror_num, 0);
			
 
				+	} else {
			
 
				+		/*
			
 
				+		 * kthread helpers are used to submit writes so that
			
 
				+		 * checksumming can happen in parallel across all CPUs
			
 
				+		 */
			
 
				+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
			
 
				+					  inode, rw, bio, mirror_num, 0,
			
 
				+					  bio_offset,
			
 
				+					  __btree_submit_bio_start,
			
 
				+					  __btree_submit_bio_done);
			
 
				 	}
			
 
				 
			
 
				-	/*
			
 
				-	 * kthread helpers are used to submit writes so that checksumming
			
 
				-	 * can happen in parallel across all CPUs
			
 
				-	 */
			
 
				-	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
			
 
				-				   inode, rw, bio, mirror_num, 0,
			
 
				-				   bio_offset,
			
 
				-				   __btree_submit_bio_start,
			
 
				-				   __btree_submit_bio_done);
			
 
				+	if (ret) {
			
 
				+out_w_error:
			
 
				+		bio_endio(bio, ret);
			
 
				+	}
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 #ifdef CONFIG_MIGRATION
			
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 
				 
			
 
				 static int btree_set_page_dirty(struct page *page)
			
 
				 {
			
 
				+#ifdef DEBUG
			
 
				 	struct extent_buffer *eb;
			
 
				 
			
 
				 	BUG_ON(!PagePrivate(page));
			
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
 
				 	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
			
 
				 	BUG_ON(!atomic_read(&eb->refs));
			
 
				 	btrfs_assert_tree_locked(eb);
			
 
				+#endif
			
 
				 	return __set_page_dirty_nobuffers(page);
			
 
				 }
			
 
				 
			
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
				 					  root->fs_info->dirty_metadata_bytes);
			
 
				 			}
			
 
				 			spin_unlock(&root->fs_info->delalloc_lock);
			
 
				-		}
			
 
				 
			
 
				-		/* ugh, clear_extent_buffer_dirty needs to lock the page */
			
 
				-		btrfs_set_lock_blocking(buf);
			
 
				-		clear_extent_buffer_dirty(buf);
			
 
				+			/* ugh, clear_extent_buffer_dirty needs to lock the page */
			
 
				+			btrfs_set_lock_blocking(buf);
			
 
				+			clear_extent_buffer_dirty(buf);
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
				 	root->root_key.objectid = objectid;
			
 
				 	root->anon_dev = 0;
			
 
				 
			
 
				-	spin_lock_init(&root->root_times_lock);
			
 
				+	spin_lock_init(&root->root_item_lock);
			
 
				 }
			
 
				 
			
 
				 static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
			
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
 
				 	init_rwsem(&fs_info->extent_commit_sem);
			
 
				 	init_rwsem(&fs_info->cleanup_work_sem);
			
 
				 	init_rwsem(&fs_info->subvol_sem);
			
 
				+	fs_info->dev_replace.lock_owner = 0;
			
 
				+	atomic_set(&fs_info->dev_replace.nesting_level, 0);
			
 
				+	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
			
 
				+	mutex_init(&fs_info->dev_replace.lock_management_lock);
			
 
				+	mutex_init(&fs_info->dev_replace.lock);
			
 
				 
			
 
				 	spin_lock_init(&fs_info->qgroup_lock);
			
 
				 	fs_info->qgroup_tree = RB_ROOT;
			
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
 
				 			   fs_info->thread_pool_size,
			
 
				 			   &fs_info->generic_worker);
			
 
				 
			
 
				+	btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
			
 
				+			   fs_info->thread_pool_size,
			
 
				+			   &fs_info->generic_worker);
			
 
				+
			
 
				 	btrfs_init_workers(&fs_info->submit_workers, "submit",
			
 
				 			   min_t(u64, fs_devices->num_devices,
			
 
				 			   fs_info->thread_pool_size),
			
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
 
				 	ret |= btrfs_start_workers(&fs_info->delayed_workers);
			
 
				 	ret |= btrfs_start_workers(&fs_info->caching_workers);
			
 
				 	ret |= btrfs_start_workers(&fs_info->readahead_workers);
			
 
				+	ret |= btrfs_start_workers(&fs_info->flush_workers);
			
 
				 	if (ret) {
			
 
				 		err = -ENOMEM;
			
 
				 		goto fail_sb_buffer;
			
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
 
				 		goto fail_tree_roots;
			
 
				 	}
			
 
				 
			
 
				-	btrfs_close_extra_devices(fs_devices);
			
 
				+	/*
			
 
				+	 * keep the device that is marked to be the target device for the
			
 
				+	 * dev_replace procedure
			
 
				+	 */
			
 
				+	btrfs_close_extra_devices(fs_info, fs_devices, 0);
			
 
				 
			
 
				 	if (!fs_devices->latest_bdev) {
			
 
				 		printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
			
@@ -2490,6 +2517,14 @@ int open_ctree(struct super_block *sb,
 
				 		goto fail_block_groups;
			
 
				 	}
			
 
				 
			
 
				+	ret = btrfs_init_dev_replace(fs_info);
			
 
				+	if (ret) {
			
 
				+		pr_err("btrfs: failed to init dev_replace: %d\n", ret);
			
 
				+		goto fail_block_groups;
			
 
				+	}
			
 
				+
			
 
				+	btrfs_close_extra_devices(fs_info, fs_devices, 1);
			
 
				+
			
 
				 	ret = btrfs_init_space_info(fs_info);
			
 
				 	if (ret) {
			
 
				 		printk(KERN_ERR "Failed to initial space info: %d\n", ret);
			
@@ -2503,6 +2538,13 @@ int open_ctree(struct super_block *sb,
 
				 	}
			
 
				 	fs_info->num_tolerated_disk_barrier_failures =
			
 
				 		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
			
 
				+	if (fs_info->fs_devices->missing_devices >
			
 
				+	     fs_info->num_tolerated_disk_barrier_failures &&
			
 
				+	    !(sb->s_flags & MS_RDONLY)) {
			
 
				+		printk(KERN_WARNING
			
 
				+		       "Btrfs: too many missing devices, writeable mount is not allowed\n");
			
 
				+		goto fail_block_groups;
			
 
				+	}
			
 
				 
			
 
				 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
			
 
				 					       "btrfs-cleaner");
			
@@ -2631,6 +2673,13 @@ int open_ctree(struct super_block *sb,
 
				 		return ret;
			
 
				 	}
			
 
				 
			
 
				+	ret = btrfs_resume_dev_replace_async(fs_info);
			
 
				+	if (ret) {
			
 
				+		pr_warn("btrfs: failed to resume dev_replace\n");
			
 
				+		close_ctree(tree_root);
			
 
				+		return ret;
			
 
				+	}
			
 
				+
			
 
				 	return 0;
			
 
				 
			
 
				 fail_qgroup:
			
@@ -2667,6 +2716,7 @@ int open_ctree(struct super_block *sb,
 
				 	btrfs_stop_workers(&fs_info->submit_workers);
			
 
				 	btrfs_stop_workers(&fs_info->delayed_workers);
			
 
				 	btrfs_stop_workers(&fs_info->caching_workers);
			
 
				+	btrfs_stop_workers(&fs_info->flush_workers);
			
 
				 fail_alloc:
			
 
				 fail_iput:
			
 
				 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
			
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
 
				 	smp_mb();
			
 
				 
			
 
				 	/* pause restriper - we want to resume on mount */
			
 
				-	btrfs_pause_balance(root->fs_info);
			
 
				+	btrfs_pause_balance(fs_info);
			
 
				 
			
 
				-	btrfs_scrub_cancel(root);
			
 
				+	btrfs_dev_replace_suspend_for_unmount(fs_info);
			
 
				+
			
 
				+	btrfs_scrub_cancel(fs_info);
			
 
				 
			
 
				 	/* wait for any defraggers to finish */
			
 
				 	wait_event(fs_info->transaction_wait,
			
 
				 		   (atomic_read(&fs_info->defrag_running) == 0));
			
 
				 
			
 
				 	/* clear out the rbtree of defraggable inodes */
			
 
				-	btrfs_run_defrag_inodes(fs_info);
			
 
				+	btrfs_cleanup_defrag_inodes(fs_info);
			
 
				 
			
 
				 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
			
 
				 		ret = btrfs_commit_super(root);
			
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
 
				 	btrfs_stop_workers(&fs_info->delayed_workers);
			
 
				 	btrfs_stop_workers(&fs_info->caching_workers);
			
 
				 	btrfs_stop_workers(&fs_info->readahead_workers);
			
 
				+	btrfs_stop_workers(&fs_info->flush_workers);
			
 
				 
			
 
				 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
			
 
				 	if (btrfs_test_opt(root, CHECK_INTEGRITY))
			
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
				 	int was_dirty;
			
 
				 
			
 
				 	btrfs_assert_tree_locked(buf);
			
 
				-	if (transid != root->fs_info->generation) {
			
 
				-		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
			
 
				+	if (transid != root->fs_info->generation)
			
 
				+		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
			
 
				 		       "found %llu running %llu\n",
			
 
				 			(unsigned long long)buf->start,
			
 
				 			(unsigned long long)transid,
			
 
				 			(unsigned long long)root->fs_info->generation);
			
 
				-		WARN_ON(1);
			
 
				-	}
			
 
				 	was_dirty = set_extent_buffer_dirty(buf);
			
 
				 	if (!was_dirty) {
			
 
				 		spin_lock(&root->fs_info->delalloc_lock);
			
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
			
 
				+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
			
 
				+					int flush_delayed)
			
 
				 {
			
 
				 	/*
			
 
				 	 * looks as though older kernels can get into trouble with
			
@@ -3411,7 +3463,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 
				 	if (current->flags & PF_MEMALLOC)
			
 
				 		return;
			
 
				 
			
 
				-	btrfs_balance_delayed_items(root);
			
 
				+	if (flush_delayed)
			
 
				+		btrfs_balance_delayed_items(root);
			
 
				 
			
 
				 	num_dirty = root->fs_info->dirty_metadata_bytes;
			
 
				 
			
@@ -3422,25 +3475,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 
				 	return;
			
 
				 }
			
 
				 
			
 
				-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
			
 
				+void btrfs_btree_balance_dirty(struct btrfs_root *root)
			
 
				 {
			
 
				-	/*
			
 
				-	 * looks as though older kernels can get into trouble with
			
 
				-	 * this code, they end up stuck in balance_dirty_pages forever
			
 
				-	 */
			
 
				-	u64 num_dirty;
			
 
				-	unsigned long thresh = 32 * 1024 * 1024;
			
 
				-
			
 
				-	if (current->flags & PF_MEMALLOC)
			
 
				-		return;
			
 
				-
			
 
				-	num_dirty = root->fs_info->dirty_metadata_bytes;
			
 
				+	__btrfs_btree_balance_dirty(root, 1);
			
 
				+}
			
 
				 
			
 
				-	if (num_dirty > thresh) {
			
 
				-		balance_dirty_pages_ratelimited(
			
 
				-				   root->fs_info->btree_inode->i_mapping);
			
 
				-	}
			
 
				-	return;
			
 
				+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
			
 
				+{
			
 
				+	__btrfs_btree_balance_dirty(root, 0);
			
 
				 }
			
 
				 
			
 
				 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
			
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 
				 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
			
 
				 					      struct btrfs_key *location);
			
 
				 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
			
 
				-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
			
 
				-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
			
 
				+void btrfs_btree_balance_dirty(struct btrfs_root *root);
			
 
				+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
			
 
				 void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
			
 
				 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
			
 
				 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
			
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
 
				 #include "volumes.h"
			
 
				 #include "locking.h"
			
 
				 #include "free-space-cache.h"
			
 
				+#include "math.h"
			
 
				 
			
 
				 #undef SCRAMBLE_DELAYED_REFS
			
 
				 
			
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 
				 	rcu_read_unlock();
			
 
				 }
			
 
				 
			
 
				-static u64 div_factor(u64 num, int factor)
			
 
				-{
			
 
				-	if (factor == 10)
			
 
				-		return num;
			
 
				-	num *= factor;
			
 
				-	do_div(num, 10);
			
 
				-	return num;
			
 
				-}
			
 
				-
			
 
				-static u64 div_factor_fine(u64 num, int factor)
			
 
				-{
			
 
				-	if (factor == 100)
			
 
				-		return num;
			
 
				-	num *= factor;
			
 
				-	do_div(num, 100);
			
 
				-	return num;
			
 
				-}
			
 
				-
			
 
				 u64 btrfs_find_block_group(struct btrfs_root *root,
			
 
				 			   u64 search_start, u64 search_hint, int owner)
			
 
				 {
			
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 
				 
			
 
				 
			
 
				 	/* Tell the block device(s) that the sectors can be discarded */
			
 
				-	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
			
 
				+	ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
			
 
				 			      bytenr, &num_bytes, &bbio, 0);
			
 
				 	/* Error condition is -ENOMEM */
			
 
				 	if (!ret) {
			
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
				 				kfree(extent_op);
			
 
				 
			
 
				 				if (ret) {
			
 
				+					list_del_init(&locked_ref->cluster);
			
 
				+					mutex_unlock(&locked_ref->mutex);
			
 
				+
			
 
				 					printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
			
 
				 					spin_lock(&delayed_refs->lock);
			
 
				 					return ret;
			
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
				 		count++;
			
 
				 
			
 
				 		if (ret) {
			
 
				+			if (locked_ref) {
			
 
				+				list_del_init(&locked_ref->cluster);
			
 
				+				mutex_unlock(&locked_ref->mutex);
			
 
				+			}
			
 
				 			printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
			
 
				 			spin_lock(&delayed_refs->lock);
			
 
				 			return ret;
			
@@ -3661,7 +3651,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 
				 
			
 
				 static int can_overcommit(struct btrfs_root *root,
			
 
				 			  struct btrfs_space_info *space_info, u64 bytes,
			
 
				-			  int flush)
			
 
				+			  enum btrfs_reserve_flush_enum flush)
			
 
				 {
			
 
				 	u64 profile = btrfs_get_alloc_profile(root, 0);
			
 
				 	u64 avail;
			
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
 
				 		avail >>= 1;
			
 
				 
			
 
				 	/*
			
 
				-	 * If we aren't flushing don't let us overcommit too much, say
			
 
				-	 * 1/8th of the space.  If we can flush, let it overcommit up to
			
 
				-	 * 1/2 of the space.
			
 
				+	 * If we aren't flushing all things, let us overcommit up to
			
 
				+	 * 1/2th of the space. If we can flush, don't let us overcommit
			
 
				+	 * too much, let it overcommit up to 1/8 of the space.
			
 
				 	 */
			
 
				-	if (flush)
			
 
				+	if (flush == BTRFS_RESERVE_FLUSH_ALL)
			
 
				 		avail >>= 3;
			
 
				 	else
			
 
				 		avail >>= 1;
			
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
			
 
				+					       unsigned long nr_pages,
			
 
				+					       enum wb_reason reason)
			
 
				+{
			
 
				+	if (!writeback_in_progress(sb->s_bdi) &&
			
 
				+	    down_read_trylock(&sb->s_umount)) {
			
 
				+		writeback_inodes_sb_nr(sb, nr_pages, reason);
			
 
				+		up_read(&sb->s_umount);
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * shrink metadata reservation for delalloc
			
 
				  */
			
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
				 	long time_left;
			
 
				 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
			
 
				 	int loops = 0;
			
 
				+	enum btrfs_reserve_flush_enum flush;
			
 
				 
			
 
				 	trans = (struct btrfs_trans_handle *)current->journal_info;
			
 
				 	block_rsv = &root->fs_info->delalloc_block_rsv;
			
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
				 	while (delalloc_bytes && loops < 3) {
			
 
				 		max_reclaim = min(delalloc_bytes, to_reclaim);
			
 
				 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
			
 
				-		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
			
 
				-					       WB_REASON_FS_FREE_SPACE);
			
 
				+		writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
			
 
				+						    nr_pages,
			
 
				+						    WB_REASON_FS_FREE_SPACE);
			
 
				 
			
 
				 		/*
			
 
				 		 * We need to wait for the async pages to actually start before
			
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
				 		wait_event(root->fs_info->async_submit_wait,
			
 
				 			   !atomic_read(&root->fs_info->async_delalloc_pages));
			
 
				 
			
 
				+		if (!trans)
			
 
				+			flush = BTRFS_RESERVE_FLUSH_ALL;
			
 
				+		else
			
 
				+			flush = BTRFS_RESERVE_NO_FLUSH;
			
 
				 		spin_lock(&space_info->lock);
			
 
				-		if (can_overcommit(root, space_info, orig, !trans)) {
			
 
				+		if (can_overcommit(root, space_info, orig, flush)) {
			
 
				 			spin_unlock(&space_info->lock);
			
 
				 			break;
			
 
				 		}
			
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
 
				  */
			
 
				 static int reserve_metadata_bytes(struct btrfs_root *root,
			
 
				 				  struct btrfs_block_rsv *block_rsv,
			
 
				-				  u64 orig_bytes, int flush)
			
 
				+				  u64 orig_bytes,
			
 
				+				  enum btrfs_reserve_flush_enum flush)
			
 
				 {
			
 
				 	struct btrfs_space_info *space_info = block_rsv->space_info;
			
 
				 	u64 used;
			
@@ -3912,10 +3923,11 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 
				 	ret = 0;
			
 
				 	spin_lock(&space_info->lock);
			
 
				 	/*
			
 
				-	 * We only want to wait if somebody other than us is flushing and we are
			
 
				-	 * actually alloed to flush.
			
 
				+	 * We only want to wait if somebody other than us is flushing and we
			
 
				+	 * are actually allowed to flush all things.
			
 
				 	 */
			
 
				-	while (flush && !flushing && space_info->flush) {
			
 
				+	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
			
 
				+	       space_info->flush) {
			
 
				 		spin_unlock(&space_info->lock);
			
 
				 		/*
			
 
				 		 * If we have a trans handle we can't wait because the flusher
			
@@ -3981,23 +3993,40 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 
				 	 * Couldn't make our reservation, save our place so while we're trying
			
 
				 	 * to reclaim space we can actually use it instead of somebody else
			
 
				 	 * stealing it from us.
			
 
				+	 *
			
 
				+	 * We make the other tasks wait for the flush only when we can flush
			
 
				+	 * all things.
			
 
				 	 */
			
 
				-	if (ret && flush) {
			
 
				+	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
			
 
				 		flushing = true;
			
 
				 		space_info->flush = 1;
			
 
				 	}
			
 
				 
			
 
				 	spin_unlock(&space_info->lock);
			
 
				 
			
 
				-	if (!ret || !flush)
			
 
				+	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
			
 
				 		goto out;
			
 
				 
			
 
				 	ret = flush_space(root, space_info, num_bytes, orig_bytes,
			
 
				 			  flush_state);
			
 
				 	flush_state++;
			
 
				+
			
 
				+	/*
			
 
				+	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
			
 
				+	 * would happen. So skip delalloc flush.
			
 
				+	 */
			
 
				+	if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
			
 
				+	    (flush_state == FLUSH_DELALLOC ||
			
 
				+	     flush_state == FLUSH_DELALLOC_WAIT))
			
 
				+		flush_state = ALLOC_CHUNK;
			
 
				+
			
 
				 	if (!ret)
			
 
				 		goto again;
			
 
				-	else if (flush_state <= COMMIT_TRANS)
			
 
				+	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
			
 
				+		 flush_state < COMMIT_TRANS)
			
 
				+		goto again;
			
 
				+	else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
			
 
				+		 flush_state <= COMMIT_TRANS)
			
 
				 		goto again;
			
 
				 
			
 
				 out:
			
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 
				 	kfree(rsv);
			
 
				 }
			
 
				 
			
 
				-static inline int __block_rsv_add(struct btrfs_root *root,
			
 
				-				  struct btrfs_block_rsv *block_rsv,
			
 
				-				  u64 num_bytes, int flush)
			
 
				+int btrfs_block_rsv_add(struct btrfs_root *root,
			
 
				+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
			
 
				+			enum btrfs_reserve_flush_enum flush)
			
 
				 {
			
 
				 	int ret;
			
 
				 
			
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-int btrfs_block_rsv_add(struct btrfs_root *root,
			
 
				-			struct btrfs_block_rsv *block_rsv,
			
 
				-			u64 num_bytes)
			
 
				-{
			
 
				-	return __block_rsv_add(root, block_rsv, num_bytes, 1);
			
 
				-}
			
 
				-
			
 
				-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
			
 
				-				struct btrfs_block_rsv *block_rsv,
			
 
				-				u64 num_bytes)
			
 
				-{
			
 
				-	return __block_rsv_add(root, block_rsv, num_bytes, 0);
			
 
				-}
			
 
				-
			
 
				 int btrfs_block_rsv_check(struct btrfs_root *root,
			
 
				 			  struct btrfs_block_rsv *block_rsv, int min_factor)
			
 
				 {
			
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
			
 
				-					   struct btrfs_block_rsv *block_rsv,
			
 
				-					   u64 min_reserved, int flush)
			
 
				+int btrfs_block_rsv_refill(struct btrfs_root *root,
			
 
				+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
			
 
				+			   enum btrfs_reserve_flush_enum flush)
			
 
				 {
			
 
				 	u64 num_bytes = 0;
			
 
				 	int ret = -ENOSPC;
			
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-int btrfs_block_rsv_refill(struct btrfs_root *root,
			
 
				-			   struct btrfs_block_rsv *block_rsv,
			
 
				-			   u64 min_reserved)
			
 
				-{
			
 
				-	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
			
 
				-}
			
 
				-
			
 
				-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
			
 
				-				   struct btrfs_block_rsv *block_rsv,
			
 
				-				   u64 min_reserved)
			
 
				-{
			
 
				-	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
			
 
				-}
			
 
				-
			
 
				 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
			
 
				 			    struct btrfs_block_rsv *dst_rsv,
			
 
				 			    u64 num_bytes)
			
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
				 	u64 csum_bytes;
			
 
				 	unsigned nr_extents = 0;
			
 
				 	int extra_reserve = 0;
			
 
				-	int flush = 1;
			
 
				+	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
			
 
				 	int ret;
			
 
				+	bool delalloc_lock = true;
			
 
				 
			
 
				-	/* Need to be holding the i_mutex here if we aren't free space cache */
			
 
				-	if (btrfs_is_free_space_inode(inode))
			
 
				-		flush = 0;
			
 
				+	/* If we are a free space inode we need to not flush since we will be in
			
 
				+	 * the middle of a transaction commit.  We also don't need the delalloc
			
 
				+	 * mutex since we won't race with anybody.  We need this mostly to make
			
 
				+	 * lockdep shut its filthy mouth.
			
 
				+	 */
			
 
				+	if (btrfs_is_free_space_inode(inode)) {
			
 
				+		flush = BTRFS_RESERVE_NO_FLUSH;
			
 
				+		delalloc_lock = false;
			
 
				+	}
			
 
				 
			
 
				-	if (flush && btrfs_transaction_in_commit(root->fs_info))
			
 
				+	if (flush != BTRFS_RESERVE_NO_FLUSH &&
			
 
				+	    btrfs_transaction_in_commit(root->fs_info))
			
 
				 		schedule_timeout(1);
			
 
				 
			
 
				-	mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
			
 
				+	if (delalloc_lock)
			
 
				+		mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
			
 
				+
			
 
				 	num_bytes = ALIGN(num_bytes, root->sectorsize);
			
 
				 
			
 
				 	spin_lock(&BTRFS_I(inode)->lock);
			
@@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
				 		ret = btrfs_qgroup_reserve(root, num_bytes +
			
 
				 					   nr_extents * root->leafsize);
			
 
				 		if (ret) {
			
 
				-			mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
			
 
				+			spin_lock(&BTRFS_I(inode)->lock);
			
 
				+			calc_csum_metadata_size(inode, num_bytes, 0);
			
 
				+			spin_unlock(&BTRFS_I(inode)->lock);
			
 
				+			if (delalloc_lock)
			
 
				+				mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
			
 
				 			return ret;
			
 
				 		}
			
 
				 	}
			
@@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
				 						      btrfs_ino(inode),
			
 
				 						      to_free, 0);
			
 
				 		}
			
 
				-		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
			
 
				+		if (root->fs_info->quota_enabled) {
			
 
				+			btrfs_qgroup_free(root, num_bytes +
			
 
				+						nr_extents * root->leafsize);
			
 
				+		}
			
 
				+		if (delalloc_lock)
			
 
				+			mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
			
 
				 		return ret;
			
 
				 	}
			
 
				 
			
@@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
				 	}
			
 
				 	BTRFS_I(inode)->reserved_extents += nr_extents;
			
 
				 	spin_unlock(&BTRFS_I(inode)->lock);
			
 
				-	mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
			
 
				+
			
 
				+	if (delalloc_lock)
			
 
				+		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
			
 
				 
			
 
				 	if (to_reserve)
			
 
				 		trace_btrfs_space_reservation(root->fs_info,"delalloc",
			
@@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 
				 {
			
 
				 	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				 	struct btrfs_block_group_cache *cache = NULL;
			
 
				+	struct btrfs_space_info *space_info;
			
 
				+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
			
 
				 	u64 len;
			
 
				+	bool readonly;
			
 
				 
			
 
				 	while (start <= end) {
			
 
				+		readonly = false;
			
 
				 		if (!cache ||
			
 
				 		    start >= cache->key.objectid + cache->key.offset) {
			
 
				 			if (cache)
			
@@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 
				 		}
			
 
				 
			
 
				 		start += len;
			
 
				+		space_info = cache->space_info;
			
 
				 
			
 
				-		spin_lock(&cache->space_info->lock);
			
 
				+		spin_lock(&space_info->lock);
			
 
				 		spin_lock(&cache->lock);
			
 
				 		cache->pinned -= len;
			
 
				-		cache->space_info->bytes_pinned -= len;
			
 
				-		if (cache->ro)
			
 
				-			cache->space_info->bytes_readonly += len;
			
 
				+		space_info->bytes_pinned -= len;
			
 
				+		if (cache->ro) {
			
 
				+			space_info->bytes_readonly += len;
			
 
				+			readonly = true;
			
 
				+		}
			
 
				 		spin_unlock(&cache->lock);
			
 
				-		spin_unlock(&cache->space_info->lock);
			
 
				+		if (!readonly && global_rsv->space_info == space_info) {
			
 
				+			spin_lock(&global_rsv->lock);
			
 
				+			if (!global_rsv->full) {
			
 
				+				len = min(len, global_rsv->size -
			
 
				+					  global_rsv->reserved);
			
 
				+				global_rsv->reserved += len;
			
 
				+				space_info->bytes_may_use += len;
			
 
				+				if (global_rsv->reserved >= global_rsv->size)
			
 
				+					global_rsv->full = 1;
			
 
				+			}
			
 
				+			spin_unlock(&global_rsv->lock);
			
 
				+		}
			
 
				+		spin_unlock(&space_info->lock);
			
 
				 	}
			
 
				 
			
 
				 	if (cache)
			
@@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int __get_block_group_index(u64 flags)
			
 
				+int __get_raid_index(u64 flags)
			
 
				 {
			
 
				 	int index;
			
 
				 
			
@@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags)
 
				 
			
 
				 static int get_block_group_index(struct btrfs_block_group_cache *cache)
			
 
				 {
			
 
				-	return __get_block_group_index(cache->flags);
			
 
				+	return __get_raid_index(cache->flags);
			
 
				 }
			
 
				 
			
 
				 enum btrfs_loop_type {
			
@@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 
				 	block_rsv = get_block_rsv(trans, root);
			
 
				 
			
 
				 	if (block_rsv->size == 0) {
			
 
				-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
			
 
				+		ret = reserve_metadata_bytes(root, block_rsv, blocksize,
			
 
				+					     BTRFS_RESERVE_NO_FLUSH);
			
 
				 		/*
			
 
				 		 * If we couldn't reserve metadata bytes try and use some from
			
 
				 		 * the global reserve.
			
@@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 
				 		static DEFINE_RATELIMIT_STATE(_rs,
			
 
				 				DEFAULT_RATELIMIT_INTERVAL,
			
 
				 				/*DEFAULT_RATELIMIT_BURST*/ 2);
			
 
				-		if (__ratelimit(&_rs)) {
			
 
				-			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
			
 
				-			WARN_ON(1);
			
 
				-		}
			
 
				-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
			
 
				+		if (__ratelimit(&_rs))
			
 
				+			WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
			
 
				+			     ret);
			
 
				+		ret = reserve_metadata_bytes(root, block_rsv, blocksize,
			
 
				+					     BTRFS_RESERVE_NO_FLUSH);
			
 
				 		if (!ret) {
			
 
				 			return block_rsv;
			
 
				 		} else if (ret && block_rsv != global_rsv) {
			
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 
				 	 */
			
 
				 	target = get_restripe_target(root->fs_info, block_group->flags);
			
 
				 	if (target) {
			
 
				-		index = __get_block_group_index(extended_to_chunk(target));
			
 
				+		index = __get_raid_index(extended_to_chunk(target));
			
 
				 	} else {
			
 
				 		/*
			
 
				 		 * this is just a balance, so if we were marked as full
			
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 
				 		 * check to make sure we can actually find a chunk with enough
			
 
				 		 * space to fit our block group in.
			
 
				 		 */
			
 
				-		if (device->total_bytes > device->bytes_used + min_free) {
			
 
				+		if (device->total_bytes > device->bytes_used + min_free &&
			
 
				+		    !device->is_tgtdev_for_dev_replace) {
			
 
				 			ret = find_free_dev_extent(device, min_free,
			
 
				 						   &dev_offset, NULL);
			
 
				 			if (!ret)
			
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
 
				 {
			
 
				 	struct rb_node *node;
			
 
				 
			
 
				-	if (end < start) {
			
 
				-		printk(KERN_ERR "btrfs end < start %llu %llu\n",
			
 
				+	if (end < start)
			
 
				+		WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
			
 
				 		       (unsigned long long)end,
			
 
				 		       (unsigned long long)start);
			
 
				-		WARN_ON(1);
			
 
				-	}
			
 
				 	state->start = start;
			
 
				 	state->end = end;
			
 
				 
			
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
 
				  * the standard behavior is to write all copies in a raid setup. here we only
			
 
				  * want to write the one bad copy. so we do the mapping for ourselves and issue
			
 
				  * submit_bio directly.
			
 
				- * to avoid any synchonization issues, wait for the data after writing, which
			
 
				+ * to avoid any synchronization issues, wait for the data after writing, which
			
 
				  * actually prevents the read that triggered the error from finishing.
			
 
				  * currently, there can be no more than two copies of every data bit. thus,
			
 
				  * exactly one rewrite is required.
			
 
				  */
			
 
				-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
			
 
				+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
			
 
				 			u64 length, u64 logical, struct page *page,
			
 
				 			int mirror_num)
			
 
				 {
			
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
 
				 	bio->bi_size = 0;
			
 
				 	map_length = length;
			
 
				 
			
 
				-	ret = btrfs_map_block(map_tree, WRITE, logical,
			
 
				+	ret = btrfs_map_block(fs_info, WRITE, logical,
			
 
				 			      &map_length, &bbio, mirror_num);
			
 
				 	if (ret) {
			
 
				 		bio_put(bio);
			
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
 
				 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
			
 
				 			 int mirror_num)
			
 
				 {
			
 
				-	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
			
 
				 	u64 start = eb->start;
			
 
				 	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
			
 
				 	int ret = 0;
			
 
				 
			
 
				 	for (i = 0; i < num_pages; i++) {
			
 
				 		struct page *p = extent_buffer_page(eb, i);
			
 
				-		ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
			
 
				+		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
			
 
				 					start, p, mirror_num);
			
 
				 		if (ret)
			
 
				 			break;
			
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
 
				 	u64 private;
			
 
				 	u64 private_failure;
			
 
				 	struct io_failure_record *failrec;
			
 
				-	struct btrfs_mapping_tree *map_tree;
			
 
				+	struct btrfs_fs_info *fs_info;
			
 
				 	struct extent_state *state;
			
 
				 	int num_copies;
			
 
				 	int did_repair = 0;
			
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
 
				 	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
			
 
				 
			
 
				 	if (state && state->start == failrec->start) {
			
 
				-		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
			
 
				-		num_copies = btrfs_num_copies(map_tree, failrec->logical,
			
 
				-						failrec->len);
			
 
				+		fs_info = BTRFS_I(inode)->root->fs_info;
			
 
				+		num_copies = btrfs_num_copies(fs_info, failrec->logical,
			
 
				+					      failrec->len);
			
 
				 		if (num_copies > 1)  {
			
 
				-			ret = repair_io_failure(map_tree, start, failrec->len,
			
 
				+			ret = repair_io_failure(fs_info, start, failrec->len,
			
 
				 						failrec->logical, page,
			
 
				 						failrec->failed_mirror);
			
 
				 			did_repair = !ret;
			
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 
				 		 * clean_io_failure() clean all those errors at once.
			
 
				 		 */
			
 
				 	}
			
 
				-	num_copies = btrfs_num_copies(
			
 
				-			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
			
 
				-			      failrec->logical, failrec->len);
			
 
				+	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
			
 
				+				      failrec->logical, failrec->len);
			
 
				 	if (num_copies == 1) {
			
 
				 		/*
			
 
				 		 * we only have a single copy of the data, so don't bother with
			
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 
				 	return bio;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Since writes are async, they will only return -ENOMEM.
			
 
				- * Reads can return the full range of I/O error conditions.
			
 
				- */
			
 
				 static int __must_check submit_one_bio(int rw, struct bio *bio,
			
 
				 				       int mirror_num, unsigned long bio_flags)
			
 
				 {
			
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 
				 	}
			
 
				 
			
 
				 	if (start + min_len > eb->len) {
			
 
				-		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
			
 
				+		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
			
 
				 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
			
 
				 		       eb->len, start, min_len);
			
 
				-		WARN_ON(1);
			
 
				 		return -EINVAL;
			
 
				 	}
			
 
				 
			
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -337,9 +337,9 @@ struct bio *
 
				 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
			
 
				 		gfp_t gfp_flags);
			
 
				 
			
 
				-struct btrfs_mapping_tree;
			
 
				+struct btrfs_fs_info;
			
 
				 
			
 
				-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
			
 
				+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
			
 
				 			u64 length, u64 logical, struct page *page,
			
 
				 			int mirror_num);
			
 
				 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
			
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
 
				 struct extent_map *alloc_extent_map(void)
			
 
				 {
			
 
				 	struct extent_map *em;
			
 
				-	em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
			
 
				+	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
			
 
				 	if (!em)
			
 
				 		return NULL;
			
 
				 	em->in_tree = 0;
			
@@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 
				 			merge = rb_entry(rb, struct extent_map, rb_node);
			
 
				 		if (rb && mergable_maps(merge, em)) {
			
 
				 			em->start = merge->start;
			
 
				+			em->orig_start = merge->orig_start;
			
 
				 			em->len += merge->len;
			
 
				 			em->block_len += merge->block_len;
			
 
				 			em->block_start = merge->block_start;
			
 
				 			merge->in_tree = 0;
			
 
				-			if (merge->generation > em->generation) {
			
 
				-				em->mod_start = em->start;
			
 
				-				em->mod_len = em->len;
			
 
				-				em->generation = merge->generation;
			
 
				-				list_move(&em->list, &tree->modified_extents);
			
 
				-			}
			
 
				+			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
			
 
				+			em->mod_start = merge->mod_start;
			
 
				+			em->generation = max(em->generation, merge->generation);
			
 
				+			list_move(&em->list, &tree->modified_extents);
			
 
				 
			
 
				 			list_del_init(&merge->list);
			
 
				 			rb_erase(&merge->rb_node, &tree->map);
			
@@ -223,11 +222,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 
				 		em->block_len += merge->len;
			
 
				 		rb_erase(&merge->rb_node, &tree->map);
			
 
				 		merge->in_tree = 0;
			
 
				-		if (merge->generation > em->generation) {
			
 
				-			em->mod_len = em->len;
			
 
				-			em->generation = merge->generation;
			
 
				-			list_move(&em->list, &tree->modified_extents);
			
 
				-		}
			
 
				+		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
			
 
				+		em->generation = max(em->generation, merge->generation);
			
 
				 		list_del_init(&merge->list);
			
 
				 		free_extent_map(merge);
			
 
				 	}
			
@@ -265,9 +261,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
 
				 	em->mod_start = em->start;
			
 
				 	em->mod_len = em->len;
			
 
				 
			
 
				-	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
			
 
				+	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
			
 
				 		prealloc = true;
			
 
				-		clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
			
 
				+		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
			
 
				 	}
			
 
				 
			
 
				 	try_merge_map(tree, em);
			
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,7 @@
 
				 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
			
 
				 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
			
 
				 #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
			
 
				+#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
			
 
				 
			
 
				 struct extent_map {
			
 
				 	struct rb_node rb_node;
			
@@ -24,6 +25,7 @@ struct extent_map {
 
				 	u64 mod_start;
			
 
				 	u64 mod_len;
			
 
				 	u64 orig_start;
			
 
				+	u64 orig_block_len;
			
 
				 	u64 block_start;
			
 
				 	u64 block_len;
			
 
				 	u64 generation;
			
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,7 +133,6 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 
				 	return ERR_PTR(ret);
			
 
				 }
			
 
				 
			
 
				-
			
 
				 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
			
 
				 			     struct btrfs_root *root,
			
 
				 			     struct btrfs_path *path, u64 objectid,
			
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+u64 btrfs_file_extent_length(struct btrfs_path *path)
			
 
				+{
			
 
				+	int extent_type;
			
 
				+	struct btrfs_file_extent_item *fi;
			
 
				+	u64 len;
			
 
				+
			
 
				+	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
			
 
				+			    struct btrfs_file_extent_item);
			
 
				+	extent_type = btrfs_file_extent_type(path->nodes[0], fi);
			
 
				+
			
 
				+	if (extent_type == BTRFS_FILE_EXTENT_REG ||
			
 
				+	    extent_type == BTRFS_FILE_EXTENT_PREALLOC)
			
 
				+		len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
			
 
				+	else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			
 
				+		len = btrfs_file_extent_inline_len(path->nodes[0], fi);
			
 
				+	else
			
 
				+		BUG();
			
 
				+
			
 
				+	return len;
			
 
				+}
			
 
				 
			
 
				 static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
			
 
				 				   struct inode *inode, struct bio *bio,
			
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
 
				 #include "compat.h"
			
 
				 #include "volumes.h"
			
 
				 
			
 
				+static struct kmem_cache *btrfs_inode_defrag_cachep;
			
 
				 /*
			
 
				  * when auto defrag is enabled we
			
 
				  * queue up these defrag structs to remember which
			
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
 
				  * If an existing record is found the defrag item you
			
 
				  * pass in is freed
			
 
				  */
			
 
				-static void __btrfs_add_inode_defrag(struct inode *inode,
			
 
				+static int __btrfs_add_inode_defrag(struct inode *inode,
			
 
				 				    struct inode_defrag *defrag)
			
 
				 {
			
 
				 	struct btrfs_root *root = BTRFS_I(inode)->root;
			
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
 
				 				entry->transid = defrag->transid;
			
 
				 			if (defrag->last_offset > entry->last_offset)
			
 
				 				entry->last_offset = defrag->last_offset;
			
 
				-			goto exists;
			
 
				+			return -EEXIST;
			
 
				 		}
			
 
				 	}
			
 
				 	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
			
 
				 	rb_link_node(&defrag->rb_node, parent, p);
			
 
				 	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
			
 
				-	return;
			
 
				+	return 0;
			
 
				+}
			
 
				 
			
 
				-exists:
			
 
				-	kfree(defrag);
			
 
				-	return;
			
 
				+static inline int __need_auto_defrag(struct btrfs_root *root)
			
 
				+{
			
 
				+	if (!btrfs_test_opt(root, AUTO_DEFRAG))
			
 
				+		return 0;
			
 
				+
			
 
				+	if (btrfs_fs_closing(root->fs_info))
			
 
				+		return 0;
			
 
				 
			
 
				+	return 1;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 
				 	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				 	struct inode_defrag *defrag;
			
 
				 	u64 transid;
			
 
				+	int ret;
			
 
				 
			
 
				-	if (!btrfs_test_opt(root, AUTO_DEFRAG))
			
 
				-		return 0;
			
 
				-
			
 
				-	if (btrfs_fs_closing(root->fs_info))
			
 
				+	if (!__need_auto_defrag(root))
			
 
				 		return 0;
			
 
				 
			
 
				 	if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
			
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 
				 	else
			
 
				 		transid = BTRFS_I(inode)->root->last_trans;
			
 
				 
			
 
				-	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
			
 
				+	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
			
 
				 	if (!defrag)
			
 
				 		return -ENOMEM;
			
 
				 
			
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 
				 	defrag->root = root->root_key.objectid;
			
 
				 
			
 
				 	spin_lock(&root->fs_info->defrag_inodes_lock);
			
 
				-	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
			
 
				-		__btrfs_add_inode_defrag(inode, defrag);
			
 
				-	else
			
 
				-		kfree(defrag);
			
 
				+	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
			
 
				+		/*
			
 
				+		 * If we set IN_DEFRAG flag and evict the inode from memory,
			
 
				+		 * and then re-read this inode, this new inode doesn't have
			
 
				+		 * IN_DEFRAG flag. In that case, we may find the existing defrag.
			
 
				+		 */
			
 
				+		ret = __btrfs_add_inode_defrag(inode, defrag);
			
 
				+		if (ret)
			
 
				+			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
			
 
				+	} else {
			
 
				+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
			
 
				+	}
			
 
				 	spin_unlock(&root->fs_info->defrag_inodes_lock);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * must be called with the defrag_inodes lock held
			
 
				+ * Requeue the defrag object. If there is a defrag object that points to
			
 
				+ * the same inode in the tree, we will merge them together (by
			
 
				+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
			
 
				  */
			
 
				-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
			
 
				-					     u64 root, u64 ino,
			
 
				-					     struct rb_node **next)
			
 
				+void btrfs_requeue_inode_defrag(struct inode *inode,
			
 
				+				struct inode_defrag *defrag)
			
 
				+{
			
 
				+	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				+	int ret;
			
 
				+
			
 
				+	if (!__need_auto_defrag(root))
			
 
				+		goto out;
			
 
				+
			
 
				+	/*
			
 
				+	 * Here we don't check the IN_DEFRAG flag, because we need to merge
			
 
				+	 * them together.
			
 
				+	 */
			
 
				+	spin_lock(&root->fs_info->defrag_inodes_lock);
			
 
				+	ret = __btrfs_add_inode_defrag(inode, defrag);
			
 
				+	spin_unlock(&root->fs_info->defrag_inodes_lock);
			
 
				+	if (ret)
			
 
				+		goto out;
			
 
				+	return;
			
 
				+out:
			
 
				+	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * pick the defragable inode that we want, if it doesn't exist, we will get
			
 
				+ * the next one.
			
 
				+ */
			
 
				+static struct inode_defrag *
			
 
				+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
			
 
				 {
			
 
				 	struct inode_defrag *entry = NULL;
			
 
				 	struct inode_defrag tmp;
			
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
 
				 	tmp.ino = ino;
			
 
				 	tmp.root = root;
			
 
				 
			
 
				-	p = info->defrag_inodes.rb_node;
			
 
				+	spin_lock(&fs_info->defrag_inodes_lock);
			
 
				+	p = fs_info->defrag_inodes.rb_node;
			
 
				 	while (p) {
			
 
				 		parent = p;
			
 
				 		entry = rb_entry(parent, struct inode_defrag, rb_node);
			
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
 
				 		else if (ret > 0)
			
 
				 			p = parent->rb_right;
			
 
				 		else
			
 
				-			return entry;
			
 
				+			goto out;
			
 
				 	}
			
 
				 
			
 
				-	if (next) {
			
 
				-		while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
			
 
				-			parent = rb_next(parent);
			
 
				+	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
			
 
				+		parent = rb_next(parent);
			
 
				+		if (parent)
			
 
				 			entry = rb_entry(parent, struct inode_defrag, rb_node);
			
 
				-		}
			
 
				-		*next = parent;
			
 
				+		else
			
 
				+			entry = NULL;
			
 
				 	}
			
 
				-	return NULL;
			
 
				+out:
			
 
				+	if (entry)
			
 
				+		rb_erase(parent, &fs_info->defrag_inodes);
			
 
				+	spin_unlock(&fs_info->defrag_inodes_lock);
			
 
				+	return entry;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * run through the list of inodes in the FS that need
			
 
				- * defragging
			
 
				- */
			
 
				-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
			
 
				+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
			
 
				 {
			
 
				 	struct inode_defrag *defrag;
			
 
				+	struct rb_node *node;
			
 
				+
			
 
				+	spin_lock(&fs_info->defrag_inodes_lock);
			
 
				+	node = rb_first(&fs_info->defrag_inodes);
			
 
				+	while (node) {
			
 
				+		rb_erase(node, &fs_info->defrag_inodes);
			
 
				+		defrag = rb_entry(node, struct inode_defrag, rb_node);
			
 
				+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
			
 
				+
			
 
				+		if (need_resched()) {
			
 
				+			spin_unlock(&fs_info->defrag_inodes_lock);
			
 
				+			cond_resched();
			
 
				+			spin_lock(&fs_info->defrag_inodes_lock);
			
 
				+		}
			
 
				+
			
 
				+		node = rb_first(&fs_info->defrag_inodes);
			
 
				+	}
			
 
				+	spin_unlock(&fs_info->defrag_inodes_lock);
			
 
				+}
			
 
				+
			
 
				+#define BTRFS_DEFRAG_BATCH	1024
			
 
				+
			
 
				+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
			
 
				+				    struct inode_defrag *defrag)
			
 
				+{
			
 
				 	struct btrfs_root *inode_root;
			
 
				 	struct inode *inode;
			
 
				-	struct rb_node *n;
			
 
				 	struct btrfs_key key;
			
 
				 	struct btrfs_ioctl_defrag_range_args range;
			
 
				-	u64 first_ino = 0;
			
 
				-	u64 root_objectid = 0;
			
 
				 	int num_defrag;
			
 
				-	int defrag_batch = 1024;
			
 
				 
			
 
				+	/* get the inode */
			
 
				+	key.objectid = defrag->root;
			
 
				+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
			
 
				+	key.offset = (u64)-1;
			
 
				+	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
			
 
				+	if (IS_ERR(inode_root)) {
			
 
				+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
			
 
				+		return PTR_ERR(inode_root);
			
 
				+	}
			
 
				+
			
 
				+	key.objectid = defrag->ino;
			
 
				+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
			
 
				+	key.offset = 0;
			
 
				+	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
			
 
				+	if (IS_ERR(inode)) {
			
 
				+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
			
 
				+		return PTR_ERR(inode);
			
 
				+	}
			
 
				+
			
 
				+	/* do a chunk of defrag */
			
 
				+	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
			
 
				 	memset(&range, 0, sizeof(range));
			
 
				 	range.len = (u64)-1;
			
 
				+	range.start = defrag->last_offset;
			
 
				+
			
 
				+	sb_start_write(fs_info->sb);
			
 
				+	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
			
 
				+				       BTRFS_DEFRAG_BATCH);
			
 
				+	sb_end_write(fs_info->sb);
			
 
				+	/*
			
 
				+	 * if we filled the whole defrag batch, there
			
 
				+	 * must be more work to do.  Queue this defrag
			
 
				+	 * again
			
 
				+	 */
			
 
				+	if (num_defrag == BTRFS_DEFRAG_BATCH) {
			
 
				+		defrag->last_offset = range.start;
			
 
				+		btrfs_requeue_inode_defrag(inode, defrag);
			
 
				+	} else if (defrag->last_offset && !defrag->cycled) {
			
 
				+		/*
			
 
				+		 * we didn't fill our defrag batch, but
			
 
				+		 * we didn't start at zero.  Make sure we loop
			
 
				+		 * around to the start of the file.
			
 
				+		 */
			
 
				+		defrag->last_offset = 0;
			
 
				+		defrag->cycled = 1;
			
 
				+		btrfs_requeue_inode_defrag(inode, defrag);
			
 
				+	} else {
			
 
				+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
			
 
				+	}
			
 
				+
			
 
				+	iput(inode);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * run through the list of inodes in the FS that need
			
 
				+ * defragging
			
 
				+ */
			
 
				+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
			
 
				+{
			
 
				+	struct inode_defrag *defrag;
			
 
				+	u64 first_ino = 0;
			
 
				+	u64 root_objectid = 0;
			
 
				 
			
 
				 	atomic_inc(&fs_info->defrag_running);
			
 
				-	spin_lock(&fs_info->defrag_inodes_lock);
			
 
				 	while(1) {
			
 
				-		n = NULL;
			
 
				+		if (!__need_auto_defrag(fs_info->tree_root))
			
 
				+			break;
			
 
				 
			
 
				 		/* find an inode to defrag */
			
 
				-		defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
			
 
				-						 first_ino, &n);
			
 
				+		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
			
 
				+						 first_ino);
			
 
				 		if (!defrag) {
			
 
				-			if (n) {
			
 
				-				defrag = rb_entry(n, struct inode_defrag,
			
 
				-						  rb_node);
			
 
				-			} else if (root_objectid || first_ino) {
			
 
				+			if (root_objectid || first_ino) {
			
 
				 				root_objectid = 0;
			
 
				 				first_ino = 0;
			
 
				 				continue;
			
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 
				 			}
			
 
				 		}
			
 
				 
			
 
				-		/* remove it from the rbtree */
			
 
				 		first_ino = defrag->ino + 1;
			
 
				 		root_objectid = defrag->root;
			
 
				-		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
			
 
				-
			
 
				-		if (btrfs_fs_closing(fs_info))
			
 
				-			goto next_free;
			
 
				-
			
 
				-		spin_unlock(&fs_info->defrag_inodes_lock);
			
 
				-
			
 
				-		/* get the inode */
			
 
				-		key.objectid = defrag->root;
			
 
				-		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
			
 
				-		key.offset = (u64)-1;
			
 
				-		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
			
 
				-		if (IS_ERR(inode_root))
			
 
				-			goto next;
			
 
				-
			
 
				-		key.objectid = defrag->ino;
			
 
				-		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
			
 
				-		key.offset = 0;
			
 
				-
			
 
				-		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
			
 
				-		if (IS_ERR(inode))
			
 
				-			goto next;
			
 
				 
			
 
				-		/* do a chunk of defrag */
			
 
				-		clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
			
 
				-		range.start = defrag->last_offset;
			
 
				-		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
			
 
				-					       defrag_batch);
			
 
				-		/*
			
 
				-		 * if we filled the whole defrag batch, there
			
 
				-		 * must be more work to do.  Queue this defrag
			
 
				-		 * again
			
 
				-		 */
			
 
				-		if (num_defrag == defrag_batch) {
			
 
				-			defrag->last_offset = range.start;
			
 
				-			__btrfs_add_inode_defrag(inode, defrag);
			
 
				-			/*
			
 
				-			 * we don't want to kfree defrag, we added it back to
			
 
				-			 * the rbtree
			
 
				-			 */
			
 
				-			defrag = NULL;
			
 
				-		} else if (defrag->last_offset && !defrag->cycled) {
			
 
				-			/*
			
 
				-			 * we didn't fill our defrag batch, but
			
 
				-			 * we didn't start at zero.  Make sure we loop
			
 
				-			 * around to the start of the file.
			
 
				-			 */
			
 
				-			defrag->last_offset = 0;
			
 
				-			defrag->cycled = 1;
			
 
				-			__btrfs_add_inode_defrag(inode, defrag);
			
 
				-			defrag = NULL;
			
 
				-		}
			
 
				-
			
 
				-		iput(inode);
			
 
				-next:
			
 
				-		spin_lock(&fs_info->defrag_inodes_lock);
			
 
				-next_free:
			
 
				-		kfree(defrag);
			
 
				+		__btrfs_run_defrag_inode(fs_info, defrag);
			
 
				 	}
			
 
				-	spin_unlock(&fs_info->defrag_inodes_lock);
			
 
				-
			
 
				 	atomic_dec(&fs_info->defrag_running);
			
 
				 
			
 
				 	/*
			
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
				 				split->block_len = em->block_len;
			
 
				 			else
			
 
				 				split->block_len = split->len;
			
 
				+			split->orig_block_len = max(split->block_len,
			
 
				+						    em->orig_block_len);
			
 
				 			split->generation = gen;
			
 
				 			split->bdev = em->bdev;
			
 
				 			split->flags = flags;
			
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
				 			split->flags = flags;
			
 
				 			split->compress_type = em->compress_type;
			
 
				 			split->generation = gen;
			
 
				+			split->orig_block_len = max(em->block_len,
			
 
				+						    em->orig_block_len);
			
 
				 
			
 
				 			if (compressed) {
			
 
				 				split->block_len = em->block_len;
			
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
				 			} else {
			
 
				 				split->block_len = split->len;
			
 
				 				split->block_start = em->block_start + diff;
			
 
				-				split->orig_start = split->start;
			
 
				+				split->orig_start = em->orig_start;
			
 
				 			}
			
 
				 
			
 
				 			ret = add_extent_mapping(em_tree, split);
			
@@ -1348,7 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
				 
			
 
				 		balance_dirty_pages_ratelimited(inode->i_mapping);
			
 
				 		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
			
 
				-			btrfs_btree_balance_dirty(root, 1);
			
 
				+			btrfs_btree_balance_dirty(root);
			
 
				 
			
 
				 		pos += copied;
			
 
				 		num_written += copied;
			
@@ -1397,6 +1463,24 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 
				 	return written ? written : err;
			
 
				 }
			
 
				 
			
 
				+static void update_time_for_write(struct inode *inode)
			
 
				+{
			
 
				+	struct timespec now;
			
 
				+
			
 
				+	if (IS_NOCMTIME(inode))
			
 
				+		return;
			
 
				+
			
 
				+	now = current_fs_time(inode->i_sb);
			
 
				+	if (!timespec_equal(&inode->i_mtime, &now))
			
 
				+		inode->i_mtime = now;
			
 
				+
			
 
				+	if (!timespec_equal(&inode->i_ctime, &now))
			
 
				+		inode->i_ctime = now;
			
 
				+
			
 
				+	if (IS_I_VERSION(inode))
			
 
				+		inode_inc_iversion(inode);
			
 
				+}
			
 
				+
			
 
				 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
			
 
				 				    const struct iovec *iov,
			
 
				 				    unsigned long nr_segs, loff_t pos)
			
@@ -1409,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
				 	ssize_t num_written = 0;
			
 
				 	ssize_t err = 0;
			
 
				 	size_t count, ocount;
			
 
				+	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
			
 
				 
			
 
				 	sb_start_write(inode->i_sb);
			
 
				 
			
@@ -1451,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	err = file_update_time(file);
			
 
				-	if (err) {
			
 
				-		mutex_unlock(&inode->i_mutex);
			
 
				-		goto out;
			
 
				-	}
			
 
				+	/*
			
 
				+	 * We reserve space for updating the inode when we reserve space for the
			
 
				+	 * extent we are going to write, so we will enospc out there.  We don't
			
 
				+	 * need to start yet another transaction to update the inode as we will
			
 
				+	 * update the inode when we finish writing whatever data we write.
			
 
				+	 */
			
 
				+	update_time_for_write(inode);
			
 
				 
			
 
				 	start_pos = round_down(pos, root->sectorsize);
			
 
				 	if (start_pos > i_size_read(inode)) {
			
@@ -1466,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	if (sync)
			
 
				+		atomic_inc(&BTRFS_I(inode)->sync_writers);
			
 
				+
			
 
				 	if (unlikely(file->f_flags & O_DIRECT)) {
			
 
				 		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
			
 
				 						   pos, ppos, count, ocount);
			
@@ -1492,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
				 	 * this will either be one more than the running transaction
			
 
				 	 * or the generation used for the next transaction if there isn't
			
 
				 	 * one running right now.
			
 
				+	 *
			
 
				+	 * We also have to set last_sub_trans to the current log transid,
			
 
				+	 * otherwise subsequent syncs to a file that's been synced in this
			
 
				+	 * transaction will appear to have already occurred.
			
 
				 	 */
			
 
				 	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
			
 
				+	BTRFS_I(inode)->last_sub_trans = root->log_transid;
			
 
				 	if (num_written > 0 || num_written == -EIOCBQUEUED) {
			
 
				 		err = generic_write_sync(file, pos, num_written);
			
 
				 		if (err < 0 && num_written > 0)
			
 
				 			num_written = err;
			
 
				 	}
			
 
				 out:
			
 
				+	if (sync)
			
 
				+		atomic_dec(&BTRFS_I(inode)->sync_writers);
			
 
				 	sb_end_write(inode->i_sb);
			
 
				 	current->backing_dev_info = NULL;
			
 
				 	return num_written ? num_written : err;
			
@@ -1550,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
				 	 * out of the ->i_mutex. If so, we can flush the dirty pages by
			
 
				 	 * multi-task, and make the performance up.
			
 
				 	 */
			
 
				+	atomic_inc(&BTRFS_I(inode)->sync_writers);
			
 
				 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
			
 
				+	atomic_dec(&BTRFS_I(inode)->sync_writers);
			
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
@@ -1561,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
				 	 * range being left.
			
 
				 	 */
			
 
				 	atomic_inc(&root->log_batch);
			
 
				-	btrfs_wait_ordered_range(inode, start, end);
			
 
				+	btrfs_wait_ordered_range(inode, start, end - start + 1);
			
 
				 	atomic_inc(&root->log_batch);
			
 
				 
			
 
				 	/*
			
@@ -1767,6 +1866,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
 
				 
			
 
				 		hole_em->block_start = EXTENT_MAP_HOLE;
			
 
				 		hole_em->block_len = 0;
			
 
				+		hole_em->orig_block_len = 0;
			
 
				 		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
			
 
				 		hole_em->compress_type = BTRFS_COMPRESS_NONE;
			
 
				 		hole_em->generation = trans->transid;
			
@@ -1796,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
				 	struct btrfs_path *path;
			
 
				 	struct btrfs_block_rsv *rsv;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
			
 
				-	u64 lockstart = (offset + mask) & ~mask;
			
 
				-	u64 lockend = ((offset + len) & ~mask) - 1;
			
 
				+	u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
			
 
				+	u64 lockend = round_down(offset + len,
			
 
				+				 BTRFS_I(inode)->root->sectorsize) - 1;
			
 
				 	u64 cur_offset = lockstart;
			
 
				 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
			
 
				 	u64 drop_end;
			
 
				-	unsigned long nr;
			
 
				 	int ret = 0;
			
 
				 	int err = 0;
			
 
				-	bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
			
 
				-		((offset + len) >> PAGE_CACHE_SHIFT);
			
 
				+	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
			
 
				+			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
			
 
				 
			
 
				 	btrfs_wait_ordered_range(inode, offset, len);
			
 
				 
			
 
				 	mutex_lock(&inode->i_mutex);
			
 
				-	if (offset >= inode->i_size) {
			
 
				-		mutex_unlock(&inode->i_mutex);
			
 
				-		return 0;
			
 
				-	}
			
 
				-
			
 
				+	/*
			
 
				+	 * We needn't truncate any page which is beyond the end of the file
			
 
				+	 * because we are sure there is no data there.
			
 
				+	 */
			
 
				 	/*
			
 
				 	 * Only do this if we are in the same page and we aren't doing the
			
 
				 	 * entire page.
			
 
				 	 */
			
 
				 	if (same_page && len < PAGE_CACHE_SIZE) {
			
 
				-		ret = btrfs_truncate_page(inode, offset, len, 0);
			
 
				+		if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
			
 
				+			ret = btrfs_truncate_page(inode, offset, len, 0);
			
 
				 		mutex_unlock(&inode->i_mutex);
			
 
				 		return ret;
			
 
				 	}
			
 
				 
			
 
				 	/* zero back part of the first page */
			
 
				-	ret = btrfs_truncate_page(inode, offset, 0, 0);
			
 
				-	if (ret) {
			
 
				-		mutex_unlock(&inode->i_mutex);
			
 
				-		return ret;
			
 
				+	if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
			
 
				+		ret = btrfs_truncate_page(inode, offset, 0, 0);
			
 
				+		if (ret) {
			
 
				+			mutex_unlock(&inode->i_mutex);
			
 
				+			return ret;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	/* zero the front end of the last page */
			
 
				-	ret = btrfs_truncate_page(inode, offset + len, 0, 1);
			
 
				-	if (ret) {
			
 
				-		mutex_unlock(&inode->i_mutex);
			
 
				-		return ret;
			
 
				+	if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
			
 
				+		ret = btrfs_truncate_page(inode, offset + len, 0, 1);
			
 
				+		if (ret) {
			
 
				+			mutex_unlock(&inode->i_mutex);
			
 
				+			return ret;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	if (lockend < lockstart) {
			
@@ -1930,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
				 			break;
			
 
				 		}
			
 
				 
			
 
				-		nr = trans->blocks_used;
			
 
				 		btrfs_end_transaction(trans, root);
			
 
				-		btrfs_btree_balance_dirty(root, nr);
			
 
				+		btrfs_btree_balance_dirty(root);
			
 
				 
			
 
				 		trans = btrfs_start_transaction(root, 3);
			
 
				 		if (IS_ERR(trans)) {
			
@@ -1963,11 +2065,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
				 	if (!trans)
			
 
				 		goto out_free;
			
 
				 
			
 
				+	inode_inc_iversion(inode);
			
 
				+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
			
 
				+
			
 
				 	trans->block_rsv = &root->fs_info->trans_block_rsv;
			
 
				 	ret = btrfs_update_inode(trans, root, inode);
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 out_free:
			
 
				 	btrfs_free_path(path);
			
 
				 	btrfs_free_block_rsv(root, rsv);
			
@@ -1991,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 
				 	u64 alloc_end;
			
 
				 	u64 alloc_hint = 0;
			
 
				 	u64 locked_end;
			
 
				-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
			
 
				 	struct extent_map *em;
			
 
				+	int blocksize = BTRFS_I(inode)->root->sectorsize;
			
 
				 	int ret;
			
 
				 
			
 
				-	alloc_start = offset & ~mask;
			
 
				-	alloc_end =  (offset + len + mask) & ~mask;
			
 
				+	alloc_start = round_down(offset, blocksize);
			
 
				+	alloc_end = round_up(offset + len, blocksize);
			
 
				 
			
 
				 	/* Make sure we aren't being give some crap mode */
			
 
				 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
			
@@ -2009,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 
				 	 * Make sure we have enough space before we do the
			
 
				 	 * allocation.
			
 
				 	 */
			
 
				-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
			
 
				+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
			
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
@@ -2077,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 
				 		}
			
 
				 		last_byte = min(extent_map_end(em), alloc_end);
			
 
				 		actual_end = min_t(u64, extent_map_end(em), offset + len);
			
 
				-		last_byte = (last_byte + mask) & ~mask;
			
 
				+		last_byte = ALIGN(last_byte, blocksize);
			
 
				 
			
 
				 		if (em->block_start == EXTENT_MAP_HOLE ||
			
 
				 		    (cur_offset >= inode->i_size &&
			
@@ -2116,7 +2220,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 
				 out:
			
 
				 	mutex_unlock(&inode->i_mutex);
			
 
				 	/* Let go of our reservation. */
			
 
				-	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
			
 
				+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -2292,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
 
				 	.compat_ioctl	= btrfs_ioctl,
			
 
				 #endif
			
 
				 };
			
 
				+
			
 
				+void btrfs_auto_defrag_exit(void)
			
 
				+{
			
 
				+	if (btrfs_inode_defrag_cachep)
			
 
				+		kmem_cache_destroy(btrfs_inode_defrag_cachep);
			
 
				+}
			
 
				+
			
 
				+int btrfs_auto_defrag_init(void)
			
 
				+{
			
 
				+	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
			
 
				+					sizeof(struct inode_defrag), 0,
			
 
				+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
			
 
				+					NULL);
			
 
				+	if (!btrfs_inode_defrag_cachep)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
 
				 
			
 
				 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
			
 
				 {
			
 
				-	WARN_ON(io_ctl->cur);
			
 
				 	BUG_ON(io_ctl->index >= io_ctl->num_pages);
			
 
				 	io_ctl->page = io_ctl->pages[io_ctl->index++];
			
 
				 	io_ctl->cur = kmap(io_ctl->page);
			
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 
				 			 * if previous extent entry covers the offset,
			
 
				 			 * we should return it instead of the bitmap entry
			
 
				 			 */
			
 
				-			n = &entry->offset_index;
			
 
				-			while (1) {
			
 
				-				n = rb_prev(n);
			
 
				-				if (!n)
			
 
				-					break;
			
 
				+			n = rb_prev(&entry->offset_index);
			
 
				+			if (n) {
			
 
				 				prev = rb_entry(n, struct btrfs_free_space,
			
 
				 						offset_index);
			
 
				-				if (!prev->bitmap) {
			
 
				-					if (prev->offset + prev->bytes > offset)
			
 
				-						entry = prev;
			
 
				-					break;
			
 
				-				}
			
 
				+				if (!prev->bitmap &&
			
 
				+				    prev->offset + prev->bytes > offset)
			
 
				+					entry = prev;
			
 
				 			}
			
 
				 		}
			
 
				 		return entry;
			
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 
				 	}
			
 
				 
			
 
				 	if (entry->bitmap) {
			
 
				-		n = &entry->offset_index;
			
 
				-		while (1) {
			
 
				-			n = rb_prev(n);
			
 
				-			if (!n)
			
 
				-				break;
			
 
				+		n = rb_prev(&entry->offset_index);
			
 
				+		if (n) {
			
 
				 			prev = rb_entry(n, struct btrfs_free_space,
			
 
				 					offset_index);
			
 
				-			if (!prev->bitmap) {
			
 
				-				if (prev->offset + prev->bytes > offset)
			
 
				-					return prev;
			
 
				-				break;
			
 
				-			}
			
 
				+			if (!prev->bitmap &&
			
 
				+			    prev->offset + prev->bytes > offset)
			
 
				+				return prev;
			
 
				 		}
			
 
				 		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
			
 
				 			return entry;
			
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 
				 	u64 bitmap_bytes;
			
 
				 	u64 extent_bytes;
			
 
				 	u64 size = block_group->key.offset;
			
 
				-	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
			
 
				+	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
			
 
				 	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
			
 
				 
			
 
				 	BUG_ON(ctl->total_bitmaps > max_bitmaps);
			
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
 
				 	 * some block groups are so tiny they can't be enveloped by a bitmap, so
			
 
				 	 * don't even bother to create a bitmap for this
			
 
				 	 */
			
 
				-	if (BITS_PER_BITMAP * block_group->sectorsize >
			
 
				-	    block_group->key.offset)
			
 
				+	if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
			
 
				 		return false;
			
 
				 
			
 
				 	return true;
			
@@ -2298,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 
				 	unsigned long total_found = 0;
			
 
				 	int ret;
			
 
				 
			
 
				-	i = offset_to_bit(entry->offset, block_group->sectorsize,
			
 
				+	i = offset_to_bit(entry->offset, ctl->unit,
			
 
				 			  max_t(u64, offset, entry->offset));
			
 
				-	want_bits = bytes_to_bits(bytes, block_group->sectorsize);
			
 
				-	min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
			
 
				+	want_bits = bytes_to_bits(bytes, ctl->unit);
			
 
				+	min_bits = bytes_to_bits(min_bytes, ctl->unit);
			
 
				 
			
 
				 again:
			
 
				 	found_bits = 0;
			
@@ -2325,23 +2313,22 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 
				 
			
 
				 	total_found += found_bits;
			
 
				 
			
 
				-	if (cluster->max_size < found_bits * block_group->sectorsize)
			
 
				-		cluster->max_size = found_bits * block_group->sectorsize;
			
 
				+	if (cluster->max_size < found_bits * ctl->unit)
			
 
				+		cluster->max_size = found_bits * ctl->unit;
			
 
				 
			
 
				 	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
			
 
				 		i = next_zero + 1;
			
 
				 		goto again;
			
 
				 	}
			
 
				 
			
 
				-	cluster->window_start = start * block_group->sectorsize +
			
 
				-		entry->offset;
			
 
				+	cluster->window_start = start * ctl->unit + entry->offset;
			
 
				 	rb_erase(&entry->offset_index, &ctl->free_space_offset);
			
 
				 	ret = tree_insert_offset(&cluster->root, entry->offset,
			
 
				 				 &entry->offset_index, 1);
			
 
				 	BUG_ON(ret); /* -EEXIST; Logic error */
			
 
				 
			
 
				 	trace_btrfs_setup_cluster(block_group, cluster,
			
 
				-				  total_found * block_group->sectorsize, 1);
			
 
				+				  total_found * ctl->unit, 1);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 
				 	 * 3 items for pre-allocation
			
 
				 	 */
			
 
				 	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
			
 
				-	ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
			
 
				-					  trans->bytes_reserved);
			
 
				+	ret = btrfs_block_rsv_add(root, trans->block_rsv,
			
 
				+				  trans->bytes_reserved,
			
 
				+				  BTRFS_RESERVE_NO_FLUSH);
			
 
				 	if (ret)
			
 
				 		goto out;
			
 
				 	trace_btrfs_space_reservation(root->fs_info, "ino_cache",
			
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
 
				 static struct extent_io_ops btrfs_extent_io_ops;
			
 
				 
			
 
				 static struct kmem_cache *btrfs_inode_cachep;
			
 
				+static struct kmem_cache *btrfs_delalloc_work_cachep;
			
 
				 struct kmem_cache *btrfs_trans_handle_cachep;
			
 
				 struct kmem_cache *btrfs_transaction_cachep;
			
 
				 struct kmem_cache *btrfs_path_cachep;
			
@@ -94,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode,
 
				 				   struct page *locked_page,
			
 
				 				   u64 start, u64 end, int *page_started,
			
 
				 				   unsigned long *nr_written, int unlock);
			
 
				+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
			
 
				+					   u64 len, u64 orig_start,
			
 
				+					   u64 block_start, u64 block_len,
			
 
				+					   u64 orig_block_len, int type);
			
 
				 
			
 
				 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
			
 
				 				     struct inode *inode,  struct inode *dir,
			
@@ -698,14 +703,19 @@ static noinline int submit_compressed_extents(struct inode *inode,
 
				 
			
 
				 		em->block_start = ins.objectid;
			
 
				 		em->block_len = ins.offset;
			
 
				+		em->orig_block_len = ins.offset;
			
 
				 		em->bdev = root->fs_info->fs_devices->latest_bdev;
			
 
				 		em->compress_type = async_extent->compress_type;
			
 
				 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
			
 
				 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
			
 
				+		em->generation = -1;
			
 
				 
			
 
				 		while (1) {
			
 
				 			write_lock(&em_tree->lock);
			
 
				 			ret = add_extent_mapping(em_tree, em);
			
 
				+			if (!ret)
			
 
				+				list_move(&em->list,
			
 
				+					  &em_tree->modified_extents);
			
 
				 			write_unlock(&em_tree->lock);
			
 
				 			if (ret != -EEXIST) {
			
 
				 				free_extent_map(em);
			
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
 
				  * required to start IO on it.  It may be clean and already done with
			
 
				  * IO when we return.
			
 
				  */
			
 
				-static noinline int cow_file_range(struct inode *inode,
			
 
				-				   struct page *locked_page,
			
 
				-				   u64 start, u64 end, int *page_started,
			
 
				-				   unsigned long *nr_written,
			
 
				-				   int unlock)
			
 
				+static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
			
 
				+				     struct inode *inode,
			
 
				+				     struct btrfs_root *root,
			
 
				+				     struct page *locked_page,
			
 
				+				     u64 start, u64 end, int *page_started,
			
 
				+				     unsigned long *nr_written,
			
 
				+				     int unlock)
			
 
				 {
			
 
				-	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				-	struct btrfs_trans_handle *trans;
			
 
				 	u64 alloc_hint = 0;
			
 
				 	u64 num_bytes;
			
 
				 	unsigned long ram_size;
			
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
 
				 	int ret = 0;
			
 
				 
			
 
				 	BUG_ON(btrfs_is_free_space_inode(inode));
			
 
				-	trans = btrfs_join_transaction(root);
			
 
				-	if (IS_ERR(trans)) {
			
 
				-		extent_clear_unlock_delalloc(inode,
			
 
				-			     &BTRFS_I(inode)->io_tree,
			
 
				-			     start, end, locked_page,
			
 
				-			     EXTENT_CLEAR_UNLOCK_PAGE |
			
 
				-			     EXTENT_CLEAR_UNLOCK |
			
 
				-			     EXTENT_CLEAR_DELALLOC |
			
 
				-			     EXTENT_CLEAR_DIRTY |
			
 
				-			     EXTENT_SET_WRITEBACK |
			
 
				-			     EXTENT_END_WRITEBACK);
			
 
				-		return PTR_ERR(trans);
			
 
				-	}
			
 
				-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
			
 
				 
			
 
				 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
			
 
				 	num_bytes = max(blocksize,  num_bytes);
			
 
				 	disk_num_bytes = num_bytes;
			
 
				-	ret = 0;
			
 
				 
			
 
				 	/* if this is a small write inside eof, kick off defrag */
			
 
				 	if (num_bytes < 64 * 1024 &&
			
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
 
				 
			
 
				 		em->block_start = ins.objectid;
			
 
				 		em->block_len = ins.offset;
			
 
				+		em->orig_block_len = ins.offset;
			
 
				 		em->bdev = root->fs_info->fs_devices->latest_bdev;
			
 
				 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
			
 
				+		em->generation = -1;
			
 
				 
			
 
				 		while (1) {
			
 
				 			write_lock(&em_tree->lock);
			
 
				 			ret = add_extent_mapping(em_tree, em);
			
 
				+			if (!ret)
			
 
				+				list_move(&em->list,
			
 
				+					  &em_tree->modified_extents);
			
 
				 			write_unlock(&em_tree->lock);
			
 
				 			if (ret != -EEXIST) {
			
 
				 				free_extent_map(em);
			
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
 
				 		alloc_hint = ins.objectid + ins.offset;
			
 
				 		start += cur_alloc_size;
			
 
				 	}
			
 
				-	ret = 0;
			
 
				 out:
			
 
				-	btrfs_end_transaction(trans, root);
			
 
				-
			
 
				 	return ret;
			
 
				+
			
 
				 out_unlock:
			
 
				 	extent_clear_unlock_delalloc(inode,
			
 
				 		     &BTRFS_I(inode)->io_tree,
			
@@ -971,6 +969,39 @@ static noinline int cow_file_range(struct inode *inode,
 
				 	goto out;
			
 
				 }
			
 
				 
			
 
				+static noinline int cow_file_range(struct inode *inode,
			
 
				+				   struct page *locked_page,
			
 
				+				   u64 start, u64 end, int *page_started,
			
 
				+				   unsigned long *nr_written,
			
 
				+				   int unlock)
			
 
				+{
			
 
				+	struct btrfs_trans_handle *trans;
			
 
				+	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				+	int ret;
			
 
				+
			
 
				+	trans = btrfs_join_transaction(root);
			
 
				+	if (IS_ERR(trans)) {
			
 
				+		extent_clear_unlock_delalloc(inode,
			
 
				+			     &BTRFS_I(inode)->io_tree,
			
 
				+			     start, end, locked_page,
			
 
				+			     EXTENT_CLEAR_UNLOCK_PAGE |
			
 
				+			     EXTENT_CLEAR_UNLOCK |
			
 
				+			     EXTENT_CLEAR_DELALLOC |
			
 
				+			     EXTENT_CLEAR_DIRTY |
			
 
				+			     EXTENT_SET_WRITEBACK |
			
 
				+			     EXTENT_END_WRITEBACK);
			
 
				+		return PTR_ERR(trans);
			
 
				+	}
			
 
				+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
			
 
				+
			
 
				+	ret = __cow_file_range(trans, inode, root, locked_page, start, end,
			
 
				+			       page_started, nr_written, unlock);
			
 
				+
			
 
				+	btrfs_end_transaction(trans, root);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * work queue call back to started compression on a file and pages
			
 
				  */
			
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 
				 	u64 extent_offset;
			
 
				 	u64 disk_bytenr;
			
 
				 	u64 num_bytes;
			
 
				+	u64 disk_num_bytes;
			
 
				 	int extent_type;
			
 
				 	int ret, err;
			
 
				 	int type;
			
@@ -1228,6 +1260,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 
				 			extent_offset = btrfs_file_extent_offset(leaf, fi);
			
 
				 			extent_end = found_key.offset +
			
 
				 				btrfs_file_extent_num_bytes(leaf, fi);
			
 
				+			disk_num_bytes =
			
 
				+				btrfs_file_extent_disk_num_bytes(leaf, fi);
			
 
				 			if (extent_end <= start) {
			
 
				 				path->slots[0]++;
			
 
				 				goto next_slot;
			
@@ -1281,9 +1315,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 
				 
			
 
				 		btrfs_release_path(path);
			
 
				 		if (cow_start != (u64)-1) {
			
 
				-			ret = cow_file_range(inode, locked_page, cow_start,
			
 
				-					found_key.offset - 1, page_started,
			
 
				-					nr_written, 1);
			
 
				+			ret = __cow_file_range(trans, inode, root, locked_page,
			
 
				+					       cow_start, found_key.offset - 1,
			
 
				+					       page_started, nr_written, 1);
			
 
				 			if (ret) {
			
 
				 				btrfs_abort_transaction(trans, root, ret);
			
 
				 				goto error;
			
@@ -1298,16 +1332,21 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 
				 			em = alloc_extent_map();
			
 
				 			BUG_ON(!em); /* -ENOMEM */
			
 
				 			em->start = cur_offset;
			
 
				-			em->orig_start = em->start;
			
 
				+			em->orig_start = found_key.offset - extent_offset;
			
 
				 			em->len = num_bytes;
			
 
				 			em->block_len = num_bytes;
			
 
				 			em->block_start = disk_bytenr;
			
 
				+			em->orig_block_len = disk_num_bytes;
			
 
				 			em->bdev = root->fs_info->fs_devices->latest_bdev;
			
 
				 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			
 
				-			set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
			
 
				+			set_bit(EXTENT_FLAG_FILLING, &em->flags);
			
 
				+			em->generation = -1;
			
 
				 			while (1) {
			
 
				 				write_lock(&em_tree->lock);
			
 
				 				ret = add_extent_mapping(em_tree, em);
			
 
				+				if (!ret)
			
 
				+					list_move(&em->list,
			
 
				+						  &em_tree->modified_extents);
			
 
				 				write_unlock(&em_tree->lock);
			
 
				 				if (ret != -EEXIST) {
			
 
				 					free_extent_map(em);
			
@@ -1352,8 +1391,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 
				 	}
			
 
				 
			
 
				 	if (cow_start != (u64)-1) {
			
 
				-		ret = cow_file_range(inode, locked_page, cow_start, end,
			
 
				-				     page_started, nr_written, 1);
			
 
				+		ret = __cow_file_range(trans, inode, root, locked_page,
			
 
				+				       cow_start, end,
			
 
				+				       page_started, nr_written, 1);
			
 
				 		if (ret) {
			
 
				 			btrfs_abort_transaction(trans, root, ret);
			
 
				 			goto error;
			
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 
				 			 unsigned long bio_flags)
			
 
				 {
			
 
				 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
			
 
				-	struct btrfs_mapping_tree *map_tree;
			
 
				 	u64 logical = (u64)bio->bi_sector << 9;
			
 
				 	u64 length = 0;
			
 
				 	u64 map_length;
			
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 
				 		return 0;
			
 
				 
			
 
				 	length = bio->bi_size;
			
 
				-	map_tree = &root->fs_info->mapping_tree;
			
 
				 	map_length = length;
			
 
				-	ret = btrfs_map_block(map_tree, READ, logical,
			
 
				+	ret = btrfs_map_block(root->fs_info, READ, logical,
			
 
				 			      &map_length, NULL, 0);
			
 
				-	/* Will always return 0 or 1 with map_multi == NULL */
			
 
				+	/* Will always return 0 with map_multi == NULL */
			
 
				 	BUG_ON(ret < 0);
			
 
				 	if (map_length < length + size)
			
 
				 		return 1;
			
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 
				 			  u64 bio_offset)
			
 
				 {
			
 
				 	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				-	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
			
 
				+	if (ret)
			
 
				+		bio_endio(bio, ret);
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
				 	int ret = 0;
			
 
				 	int skip_sum;
			
 
				 	int metadata = 0;
			
 
				+	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
			
 
				 
			
 
				 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
			
 
				 
			
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
				 	if (!(rw & REQ_WRITE)) {
			
 
				 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
			
 
				 		if (ret)
			
 
				-			return ret;
			
 
				+			goto out;
			
 
				 
			
 
				 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			
 
				-			return btrfs_submit_compressed_read(inode, bio,
			
 
				-						    mirror_num, bio_flags);
			
 
				+			ret = btrfs_submit_compressed_read(inode, bio,
			
 
				+							   mirror_num,
			
 
				+							   bio_flags);
			
 
				+			goto out;
			
 
				 		} else if (!skip_sum) {
			
 
				 			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
			
 
				 			if (ret)
			
 
				-				return ret;
			
 
				+				goto out;
			
 
				 		}
			
 
				 		goto mapit;
			
 
				-	} else if (!skip_sum) {
			
 
				+	} else if (async && !skip_sum) {
			
 
				 		/* csum items have already been cloned */
			
 
				 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			
 
				 			goto mapit;
			
 
				 		/* we're doing a write, do the async checksumming */
			
 
				-		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
			
 
				+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
			
 
				 				   inode, rw, bio, mirror_num,
			
 
				 				   bio_flags, bio_offset,
			
 
				 				   __btrfs_submit_bio_start,
			
 
				 				   __btrfs_submit_bio_done);
			
 
				+		goto out;
			
 
				+	} else if (!skip_sum) {
			
 
				+		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
			
 
				+		if (ret)
			
 
				+			goto out;
			
 
				 	}
			
 
				 
			
 
				 mapit:
			
 
				-	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
			
 
				+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
			
 
				+
			
 
				+out:
			
 
				+	if (ret < 0)
			
 
				+		bio_endio(bio, ret);
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 
				 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			
 
				 			      struct extent_state **cached_state)
			
 
				 {
			
 
				-	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
			
 
				-		WARN_ON(1);
			
 
				+	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
			
 
				 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
			
 
				 				   cached_state, GFP_NOFS);
			
 
				 }
			
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
				 
			
 
				 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
			
 
				 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
			
 
				-		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
			
 
				-		if (!ret) {
			
 
				-			if (nolock)
			
 
				-				trans = btrfs_join_transaction_nolock(root);
			
 
				-			else
			
 
				-				trans = btrfs_join_transaction(root);
			
 
				-			if (IS_ERR(trans)) {
			
 
				-				ret = PTR_ERR(trans);
			
 
				-				trans = NULL;
			
 
				-				goto out;
			
 
				-			}
			
 
				-			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
			
 
				-			ret = btrfs_update_inode_fallback(trans, root, inode);
			
 
				-			if (ret) /* -ENOMEM or corruption */
			
 
				-				btrfs_abort_transaction(trans, root, ret);
			
 
				+		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
			
 
				+		if (nolock)
			
 
				+			trans = btrfs_join_transaction_nolock(root);
			
 
				+		else
			
 
				+			trans = btrfs_join_transaction(root);
			
 
				+		if (IS_ERR(trans)) {
			
 
				+			ret = PTR_ERR(trans);
			
 
				+			trans = NULL;
			
 
				+			goto out;
			
 
				 		}
			
 
				+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
			
 
				+		ret = btrfs_update_inode_fallback(trans, root, inode);
			
 
				+		if (ret) /* -ENOMEM or corruption */
			
 
				+			btrfs_abort_transaction(trans, root, ret);
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
				 	add_pending_csums(trans, inode, ordered_extent->file_offset,
			
 
				 			  &ordered_extent->list);
			
 
				 
			
 
				-	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
			
 
				-	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
			
 
				-		ret = btrfs_update_inode_fallback(trans, root, inode);
			
 
				-		if (ret) { /* -ENOMEM or corruption */
			
 
				-			btrfs_abort_transaction(trans, root, ret);
			
 
				-			goto out_unlock;
			
 
				-		}
			
 
				-	} else {
			
 
				-		btrfs_set_inode_last_trans(trans, inode);
			
 
				+	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
			
 
				+	ret = btrfs_update_inode_fallback(trans, root, inode);
			
 
				+	if (ret) { /* -ENOMEM or corruption */
			
 
				+		btrfs_abort_transaction(trans, root, ret);
			
 
				+		goto out_unlock;
			
 
				 	}
			
 
				 	ret = 0;
			
 
				 out_unlock:
			
@@ -3074,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
				 	struct btrfs_trans_handle *trans;
			
 
				 	struct inode *inode = dentry->d_inode;
			
 
				 	int ret;
			
 
				-	unsigned long nr = 0;
			
 
				 
			
 
				 	trans = __unlink_start_trans(dir, dentry);
			
 
				 	if (IS_ERR(trans))
			
@@ -3094,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
				 	}
			
 
				 
			
 
				 out:
			
 
				-	nr = trans->blocks_used;
			
 
				 	__unlink_end_trans(trans, root);
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -3186,7 +3233,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 
				 	int err = 0;
			
 
				 	struct btrfs_root *root = BTRFS_I(dir)->root;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				-	unsigned long nr = 0;
			
 
				 
			
 
				 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
			
 
				 		return -ENOTEMPTY;
			
@@ -3215,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 
				 	if (!err)
			
 
				 		btrfs_i_size_write(inode, 0);
			
 
				 out:
			
 
				-	nr = trans->blocks_used;
			
 
				 	__unlink_end_trans(trans, root);
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 
			
 
				 	return err;
			
 
				 }
			
@@ -3497,11 +3542,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 
				 	if (ret)
			
 
				 		goto out;
			
 
				 
			
 
				-	ret = -ENOMEM;
			
 
				 again:
			
 
				 	page = find_or_create_page(mapping, index, mask);
			
 
				 	if (!page) {
			
 
				 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
			
 
				+		ret = -ENOMEM;
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
@@ -3550,7 +3595,6 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				-	ret = 0;
			
 
				 	if (offset != PAGE_CACHE_SIZE) {
			
 
				 		if (!len)
			
 
				 			len = PAGE_CACHE_SIZE - offset;
			
@@ -3668,6 +3712,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
				 
			
 
				 			hole_em->block_start = EXTENT_MAP_HOLE;
			
 
				 			hole_em->block_len = 0;
			
 
				+			hole_em->orig_block_len = 0;
			
 
				 			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
			
 
				 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
			
 
				 			hole_em->generation = trans->transid;
			
@@ -3783,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode)
 
				 	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				 	struct btrfs_block_rsv *rsv, *global_rsv;
			
 
				 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
			
 
				-	unsigned long nr;
			
 
				 	int ret;
			
 
				 
			
 
				 	trace_btrfs_inode_evict(inode);
			
@@ -3829,7 +3873,8 @@ void btrfs_evict_inode(struct inode *inode)
 
				 	 * inode item when doing the truncate.
			
 
				 	 */
			
 
				 	while (1) {
			
 
				-		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
			
 
				+		ret = btrfs_block_rsv_refill(root, rsv, min_size,
			
 
				+					     BTRFS_RESERVE_FLUSH_LIMIT);
			
 
				 
			
 
				 		/*
			
 
				 		 * Try and steal from the global reserve since we will
			
@@ -3847,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode)
 
				 			goto no_delete;
			
 
				 		}
			
 
				 
			
 
				-		trans = btrfs_start_transaction_noflush(root, 1);
			
 
				+		trans = btrfs_start_transaction_lflush(root, 1);
			
 
				 		if (IS_ERR(trans)) {
			
 
				 			btrfs_orphan_del(NULL, inode);
			
 
				 			btrfs_free_block_rsv(root, rsv);
			
@@ -3864,10 +3909,9 @@ void btrfs_evict_inode(struct inode *inode)
 
				 		ret = btrfs_update_inode(trans, root, inode);
			
 
				 		BUG_ON(ret);
			
 
				 
			
 
				-		nr = trans->blocks_used;
			
 
				 		btrfs_end_transaction(trans, root);
			
 
				 		trans = NULL;
			
 
				-		btrfs_btree_balance_dirty(root, nr);
			
 
				+		btrfs_btree_balance_dirty(root);
			
 
				 	}
			
 
				 
			
 
				 	btrfs_free_block_rsv(root, rsv);
			
@@ -3883,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode)
 
				 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
			
 
				 		btrfs_return_ino(root, btrfs_ino(inode));
			
 
				 
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 no_delete:
			
 
				 	clear_inode(inode);
			
 
				 	return;
			
@@ -4775,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
				 	if (S_ISREG(mode)) {
			
 
				 		if (btrfs_test_opt(root, NODATASUM))
			
 
				 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
			
 
				-		if (btrfs_test_opt(root, NODATACOW) ||
			
 
				-		    (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
			
 
				+		if (btrfs_test_opt(root, NODATACOW))
			
 
				 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
			
 
				 	}
			
 
				 
			
@@ -4842,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 
				 	ret = btrfs_insert_dir_item(trans, root, name, name_len,
			
 
				 				    parent_inode, &key,
			
 
				 				    btrfs_inode_type(inode), index);
			
 
				-	if (ret == -EEXIST)
			
 
				+	if (ret == -EEXIST || ret == -EOVERFLOW)
			
 
				 		goto fail_dir_item;
			
 
				 	else if (ret) {
			
 
				 		btrfs_abort_transaction(trans, root, ret);
			
@@ -4897,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 
				 	int err;
			
 
				 	int drop_inode = 0;
			
 
				 	u64 objectid;
			
 
				-	unsigned long nr = 0;
			
 
				 	u64 index = 0;
			
 
				 
			
 
				 	if (!new_valid_dev(rdev))
			
@@ -4930,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				+	err = btrfs_update_inode(trans, root, inode);
			
 
				+	if (err) {
			
 
				+		drop_inode = 1;
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				 	/*
			
 
				 	* If the active LSM wants to access the inode during
			
 
				 	* d_instantiate it needs these. Smack checks to see
			
@@ -4947,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 
				 		d_instantiate(dentry, inode);
			
 
				 	}
			
 
				 out_unlock:
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 	if (drop_inode) {
			
 
				 		inode_dec_link_count(inode);
			
 
				 		iput(inode);
			
@@ -4963,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 
				 	struct btrfs_trans_handle *trans;
			
 
				 	struct btrfs_root *root = BTRFS_I(dir)->root;
			
 
				 	struct inode *inode = NULL;
			
 
				-	int drop_inode = 0;
			
 
				+	int drop_inode_on_err = 0;
			
 
				 	int err;
			
 
				-	unsigned long nr = 0;
			
 
				 	u64 objectid;
			
 
				 	u64 index = 0;
			
 
				 
			
@@ -4989,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 
				 		err = PTR_ERR(inode);
			
 
				 		goto out_unlock;
			
 
				 	}
			
 
				+	drop_inode_on_err = 1;
			
 
				 
			
 
				 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
			
 
				-	if (err) {
			
 
				-		drop_inode = 1;
			
 
				+	if (err)
			
 
				+		goto out_unlock;
			
 
				+
			
 
				+	err = btrfs_update_inode(trans, root, inode);
			
 
				+	if (err)
			
 
				 		goto out_unlock;
			
 
				-	}
			
 
				 
			
 
				 	/*
			
 
				 	* If the active LSM wants to access the inode during
			
@@ -5007,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 
				 
			
 
				 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
			
 
				 	if (err)
			
 
				-		drop_inode = 1;
			
 
				-	else {
			
 
				-		inode->i_mapping->a_ops = &btrfs_aops;
			
 
				-		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
			
 
				-		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
			
 
				-		d_instantiate(dentry, inode);
			
 
				-	}
			
 
				+		goto out_unlock;
			
 
				+
			
 
				+	inode->i_mapping->a_ops = &btrfs_aops;
			
 
				+	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
			
 
				+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
			
 
				+	d_instantiate(dentry, inode);
			
 
				+
			
 
				 out_unlock:
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				-	if (drop_inode) {
			
 
				+	if (err && drop_inode_on_err) {
			
 
				 		inode_dec_link_count(inode);
			
 
				 		iput(inode);
			
 
				 	}
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 	return err;
			
 
				 }
			
 
				 
			
@@ -5032,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
				 	struct btrfs_root *root = BTRFS_I(dir)->root;
			
 
				 	struct inode *inode = old_dentry->d_inode;
			
 
				 	u64 index;
			
 
				-	unsigned long nr = 0;
			
 
				 	int err;
			
 
				 	int drop_inode = 0;
			
 
				 
			
@@ -5062,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
				 	inode_inc_iversion(inode);
			
 
				 	inode->i_ctime = CURRENT_TIME;
			
 
				 	ihold(inode);
			
 
				+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
			
 
				 
			
 
				 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
			
 
				 
			
@@ -5076,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
				 		btrfs_log_new_name(trans, inode, NULL, parent);
			
 
				 	}
			
 
				 
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				 fail:
			
 
				 	if (drop_inode) {
			
 
				 		inode_dec_link_count(inode);
			
 
				 		iput(inode);
			
 
				 	}
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 	return err;
			
 
				 }
			
 
				 
			
@@ -5096,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
				 	int drop_on_err = 0;
			
 
				 	u64 objectid = 0;
			
 
				 	u64 index = 0;
			
 
				-	unsigned long nr = 1;
			
 
				 
			
 
				 	/*
			
 
				 	 * 2 items for inode and ref
			
@@ -5142,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
				 	drop_on_err = 0;
			
 
				 
			
 
				 out_fail:
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				 	if (drop_on_err)
			
 
				 		iput(inode);
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 	return err;
			
 
				 }
			
 
				 
			
@@ -5340,6 +5384,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 
				 		if (start + len <= found_key.offset)
			
 
				 			goto not_found;
			
 
				 		em->start = start;
			
 
				+		em->orig_start = start;
			
 
				 		em->len = found_key.offset - start;
			
 
				 		goto not_found_em;
			
 
				 	}
			
@@ -5350,6 +5395,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 
				 		em->len = extent_end - extent_start;
			
 
				 		em->orig_start = extent_start -
			
 
				 				 btrfs_file_extent_offset(leaf, item);
			
 
				+		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
			
 
				+								      item);
			
 
				 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
			
 
				 		if (bytenr == 0) {
			
 
				 			em->block_start = EXTENT_MAP_HOLE;
			
@@ -5359,8 +5406,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 
				 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
			
 
				 			em->compress_type = compress_type;
			
 
				 			em->block_start = bytenr;
			
 
				-			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
			
 
				-									 item);
			
 
				+			em->block_len = em->orig_block_len;
			
 
				 		} else {
			
 
				 			bytenr += btrfs_file_extent_offset(leaf, item);
			
 
				 			em->block_start = bytenr;
			
@@ -5390,7 +5436,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 
				 		em->start = extent_start + extent_offset;
			
 
				 		em->len = (copy_size + root->sectorsize - 1) &
			
 
				 			~((u64)root->sectorsize - 1);
			
 
				-		em->orig_start = EXTENT_MAP_INLINE;
			
 
				+		em->orig_block_len = em->len;
			
 
				+		em->orig_start = em->start;
			
 
				 		if (compress_type) {
			
 
				 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
			
 
				 			em->compress_type = compress_type;
			
@@ -5439,11 +5486,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 
				 				    extent_map_end(em) - 1, NULL, GFP_NOFS);
			
 
				 		goto insert;
			
 
				 	} else {
			
 
				-		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
			
 
				-		WARN_ON(1);
			
 
				+		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
			
 
				 	}
			
 
				 not_found:
			
 
				 	em->start = start;
			
 
				+	em->orig_start = start;
			
 
				 	em->len = len;
			
 
				 not_found_em:
			
 
				 	em->block_start = EXTENT_MAP_HOLE;
			
@@ -5645,38 +5692,19 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
 
				 }
			
 
				 
			
 
				 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
			
 
				-						  struct extent_map *em,
			
 
				 						  u64 start, u64 len)
			
 
				 {
			
 
				 	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
			
 
				+	struct extent_map *em;
			
 
				 	struct btrfs_key ins;
			
 
				 	u64 alloc_hint;
			
 
				 	int ret;
			
 
				-	bool insert = false;
			
 
				-
			
 
				-	/*
			
 
				-	 * Ok if the extent map we looked up is a hole and is for the exact
			
 
				-	 * range we want, there is no reason to allocate a new one, however if
			
 
				-	 * it is not right then we need to free this one and drop the cache for
			
 
				-	 * our range.
			
 
				-	 */
			
 
				-	if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
			
 
				-	    em->len != len) {
			
 
				-		free_extent_map(em);
			
 
				-		em = NULL;
			
 
				-		insert = true;
			
 
				-		btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
			
 
				-	}
			
 
				 
			
 
				 	trans = btrfs_join_transaction(root);
			
 
				 	if (IS_ERR(trans))
			
 
				 		return ERR_CAST(trans);
			
 
				 
			
 
				-	if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
			
 
				-		btrfs_add_inode_defrag(trans, inode);
			
 
				-
			
 
				 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
			
 
				 
			
 
				 	alloc_hint = get_extent_allocation_hint(inode, start, len);
			
@@ -5687,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	if (!em) {
			
 
				-		em = alloc_extent_map();
			
 
				-		if (!em) {
			
 
				-			em = ERR_PTR(-ENOMEM);
			
 
				-			goto out;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	em->start = start;
			
 
				-	em->orig_start = em->start;
			
 
				-	em->len = ins.offset;
			
 
				-
			
 
				-	em->block_start = ins.objectid;
			
 
				-	em->block_len = ins.offset;
			
 
				-	em->bdev = root->fs_info->fs_devices->latest_bdev;
			
 
				-
			
 
				-	/*
			
 
				-	 * We need to do this because if we're using the original em we searched
			
 
				-	 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
			
 
				-	 */
			
 
				-	em->flags = 0;
			
 
				-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
			
 
				-
			
 
				-	while (insert) {
			
 
				-		write_lock(&em_tree->lock);
			
 
				-		ret = add_extent_mapping(em_tree, em);
			
 
				-		write_unlock(&em_tree->lock);
			
 
				-		if (ret != -EEXIST)
			
 
				-			break;
			
 
				-		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
			
 
				-	}
			
 
				+	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
			
 
				+			      ins.offset, ins.offset, 0);
			
 
				+	if (IS_ERR(em))
			
 
				+		goto out;
			
 
				 
			
 
				 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
			
 
				 					   ins.offset, ins.offset, 0);
			
@@ -5894,7 +5895,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 
				 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
			
 
				 					   u64 len, u64 orig_start,
			
 
				 					   u64 block_start, u64 block_len,
			
 
				-					   int type)
			
 
				+					   u64 orig_block_len, int type)
			
 
				 {
			
 
				 	struct extent_map_tree *em_tree;
			
 
				 	struct extent_map *em;
			
@@ -5912,15 +5913,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 
				 	em->block_len = block_len;
			
 
				 	em->block_start = block_start;
			
 
				 	em->bdev = root->fs_info->fs_devices->latest_bdev;
			
 
				+	em->orig_block_len = orig_block_len;
			
 
				+	em->generation = -1;
			
 
				 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
			
 
				 	if (type == BTRFS_ORDERED_PREALLOC)
			
 
				-		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
			
 
				+		set_bit(EXTENT_FLAG_FILLING, &em->flags);
			
 
				 
			
 
				 	do {
			
 
				 		btrfs_drop_extent_cache(inode, em->start,
			
 
				 				em->start + em->len - 1, 0);
			
 
				 		write_lock(&em_tree->lock);
			
 
				 		ret = add_extent_mapping(em_tree, em);
			
 
				+		if (!ret)
			
 
				+			list_move(&em->list,
			
 
				+				  &em_tree->modified_extents);
			
 
				 		write_unlock(&em_tree->lock);
			
 
				 	} while (ret == -EEXIST);
			
 
				 
			
@@ -6047,13 +6053,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 
				 			goto must_cow;
			
 
				 
			
 
				 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
			
 
				-			u64 orig_start = em->start;
			
 
				+			u64 orig_start = em->orig_start;
			
 
				+			u64 orig_block_len = em->orig_block_len;
			
 
				 
			
 
				 			if (type == BTRFS_ORDERED_PREALLOC) {
			
 
				 				free_extent_map(em);
			
 
				 				em = create_pinned_em(inode, start, len,
			
 
				 						       orig_start,
			
 
				-						       block_start, len, type);
			
 
				+						       block_start, len,
			
 
				+						       orig_block_len, type);
			
 
				 				if (IS_ERR(em)) {
			
 
				 					btrfs_end_transaction(trans, root);
			
 
				 					goto unlock_err;
			
@@ -6077,7 +6085,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 
				 	 * it above
			
 
				 	 */
			
 
				 	len = bh_result->b_size;
			
 
				-	em = btrfs_new_extent_direct(inode, em, start, len);
			
 
				+	free_extent_map(em);
			
 
				+	em = btrfs_new_extent_direct(inode, start, len);
			
 
				 	if (IS_ERR(em)) {
			
 
				 		ret = PTR_ERR(em);
			
 
				 		goto unlock_err;
			
@@ -6318,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 
				 	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				 	int ret;
			
 
				 
			
 
				+	if (async_submit)
			
 
				+		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
			
 
				+
			
 
				 	bio_get(bio);
			
 
				 
			
 
				 	if (!write) {
			
@@ -6362,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
				 {
			
 
				 	struct inode *inode = dip->inode;
			
 
				 	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				-	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
			
 
				 	struct bio *bio;
			
 
				 	struct bio *orig_bio = dip->orig_bio;
			
 
				 	struct bio_vec *bvec = orig_bio->bi_io_vec;
			
@@ -6375,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
				 	int async_submit = 0;
			
 
				 
			
 
				 	map_length = orig_bio->bi_size;
			
 
				-	ret = btrfs_map_block(map_tree, READ, start_sector << 9,
			
 
				+	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
			
 
				 			      &map_length, NULL, 0);
			
 
				 	if (ret) {
			
 
				 		bio_put(orig_bio);
			
@@ -6429,7 +6440,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
				 			bio->bi_end_io = btrfs_end_dio_bio;
			
 
				 
			
 
				 			map_length = orig_bio->bi_size;
			
 
				-			ret = btrfs_map_block(map_tree, READ, start_sector << 9,
			
 
				+			ret = btrfs_map_block(root->fs_info, READ,
			
 
				+					      start_sector << 9,
			
 
				 					      &map_length, NULL, 0);
			
 
				 			if (ret) {
			
 
				 				bio_put(bio);
			
@@ -6582,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 
				 		   btrfs_submit_direct, 0);
			
 
				 }
			
 
				 
			
 
				+#define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
			
 
				+
			
 
				 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
			
 
				 		__u64 start, __u64 len)
			
 
				 {
			
 
				+	int	ret;
			
 
				+
			
 
				+	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				 	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
			
 
				 }
			
 
				 
			
@@ -6855,7 +6875,6 @@ static int btrfs_truncate(struct inode *inode)
 
				 	int ret;
			
 
				 	int err = 0;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				-	unsigned long nr;
			
 
				 	u64 mask = root->sectorsize - 1;
			
 
				 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
			
 
				 
			
@@ -6978,9 +6997,8 @@ static int btrfs_truncate(struct inode *inode)
 
				 			break;
			
 
				 		}
			
 
				 
			
 
				-		nr = trans->blocks_used;
			
 
				 		btrfs_end_transaction(trans, root);
			
 
				-		btrfs_btree_balance_dirty(root, nr);
			
 
				+		btrfs_btree_balance_dirty(root);
			
 
				 
			
 
				 		trans = btrfs_start_transaction(root, 2);
			
 
				 		if (IS_ERR(trans)) {
			
@@ -7014,9 +7032,8 @@ static int btrfs_truncate(struct inode *inode)
 
				 		if (ret && !err)
			
 
				 			err = ret;
			
 
				 
			
 
				-		nr = trans->blocks_used;
			
 
				 		ret = btrfs_end_transaction(trans, root);
			
 
				-		btrfs_btree_balance_dirty(root, nr);
			
 
				+		btrfs_btree_balance_dirty(root);
			
 
				 	}
			
 
				 
			
 
				 out:
			
@@ -7093,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
				 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
			
 
				 	ei->io_tree.track_uptodate = 1;
			
 
				 	ei->io_failure_tree.track_uptodate = 1;
			
 
				+	atomic_set(&ei->sync_writers, 0);
			
 
				 	mutex_init(&ei->log_mutex);
			
 
				 	mutex_init(&ei->delalloc_mutex);
			
 
				 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
			
@@ -7203,6 +7221,8 @@ void btrfs_destroy_cachep(void)
 
				 		kmem_cache_destroy(btrfs_path_cachep);
			
 
				 	if (btrfs_free_space_cachep)
			
 
				 		kmem_cache_destroy(btrfs_free_space_cachep);
			
 
				+	if (btrfs_delalloc_work_cachep)
			
 
				+		kmem_cache_destroy(btrfs_delalloc_work_cachep);
			
 
				 }
			
 
				 
			
 
				 int btrfs_init_cachep(void)
			
@@ -7237,6 +7257,13 @@ int btrfs_init_cachep(void)
 
				 	if (!btrfs_free_space_cachep)
			
 
				 		goto fail;
			
 
				 
			
 
				+	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
			
 
				+			sizeof(struct btrfs_delalloc_work), 0,
			
 
				+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
			
 
				+			NULL);
			
 
				+	if (!btrfs_delalloc_work_cachep)
			
 
				+		goto fail;
			
 
				+
			
 
				 	return 0;
			
 
				 fail:
			
 
				 	btrfs_destroy_cachep();
			
@@ -7308,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
				 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
			
 
				 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
			
 
				 		return -ENOTEMPTY;
			
 
				+
			
 
				+
			
 
				+	/* check for collisions, even if the  name isn't there */
			
 
				+	ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
			
 
				+			     new_dentry->d_name.name,
			
 
				+			     new_dentry->d_name.len);
			
 
				+
			
 
				+	if (ret) {
			
 
				+		if (ret == -EEXIST) {
			
 
				+			/* we shouldn't get
			
 
				+			 * eexist without a new_inode */
			
 
				+			if (!new_inode) {
			
 
				+				WARN_ON(1);
			
 
				+				return ret;
			
 
				+			}
			
 
				+		} else {
			
 
				+			/* maybe -EOVERFLOW */
			
 
				+			return ret;
			
 
				+		}
			
 
				+	}
			
 
				+	ret = 0;
			
 
				+
			
 
				 	/*
			
 
				 	 * we're using rename to replace one file with another.
			
 
				 	 * and the replacement file is large.  Start IO on it now so
			
@@ -7447,6 +7496,49 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+static void btrfs_run_delalloc_work(struct btrfs_work *work)
			
 
				+{
			
 
				+	struct btrfs_delalloc_work *delalloc_work;
			
 
				+
			
 
				+	delalloc_work = container_of(work, struct btrfs_delalloc_work,
			
 
				+				     work);
			
 
				+	if (delalloc_work->wait)
			
 
				+		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
			
 
				+	else
			
 
				+		filemap_flush(delalloc_work->inode->i_mapping);
			
 
				+
			
 
				+	if (delalloc_work->delay_iput)
			
 
				+		btrfs_add_delayed_iput(delalloc_work->inode);
			
 
				+	else
			
 
				+		iput(delalloc_work->inode);
			
 
				+	complete(&delalloc_work->completion);
			
 
				+}
			
 
				+
			
 
				+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
			
 
				+						    int wait, int delay_iput)
			
 
				+{
			
 
				+	struct btrfs_delalloc_work *work;
			
 
				+
			
 
				+	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
			
 
				+	if (!work)
			
 
				+		return NULL;
			
 
				+
			
 
				+	init_completion(&work->completion);
			
 
				+	INIT_LIST_HEAD(&work->list);
			
 
				+	work->inode = inode;
			
 
				+	work->wait = wait;
			
 
				+	work->delay_iput = delay_iput;
			
 
				+	work->work.func = btrfs_run_delalloc_work;
			
 
				+
			
 
				+	return work;
			
 
				+}
			
 
				+
			
 
				+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
			
 
				+{
			
 
				+	wait_for_completion(&work->completion);
			
 
				+	kmem_cache_free(btrfs_delalloc_work_cachep, work);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * some fairly slow code that needs optimization. This walks the list
			
 
				  * of all the inodes with pending delalloc and forces them to disk.
			
@@ -7456,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 
				 	struct list_head *head = &root->fs_info->delalloc_inodes;
			
 
				 	struct btrfs_inode *binode;
			
 
				 	struct inode *inode;
			
 
				+	struct btrfs_delalloc_work *work, *next;
			
 
				+	struct list_head works;
			
 
				+	int ret = 0;
			
 
				 
			
 
				 	if (root->fs_info->sb->s_flags & MS_RDONLY)
			
 
				 		return -EROFS;
			
 
				 
			
 
				+	INIT_LIST_HEAD(&works);
			
 
				+
			
 
				 	spin_lock(&root->fs_info->delalloc_lock);
			
 
				 	while (!list_empty(head)) {
			
 
				 		binode = list_entry(head->next, struct btrfs_inode,
			
@@ -7469,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 
				 			list_del_init(&binode->delalloc_inodes);
			
 
				 		spin_unlock(&root->fs_info->delalloc_lock);
			
 
				 		if (inode) {
			
 
				-			filemap_flush(inode->i_mapping);
			
 
				-			if (delay_iput)
			
 
				-				btrfs_add_delayed_iput(inode);
			
 
				-			else
			
 
				-				iput(inode);
			
 
				+			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
			
 
				+			if (!work) {
			
 
				+				ret = -ENOMEM;
			
 
				+				goto out;
			
 
				+			}
			
 
				+			list_add_tail(&work->list, &works);
			
 
				+			btrfs_queue_worker(&root->fs_info->flush_workers,
			
 
				+					   &work->work);
			
 
				 		}
			
 
				 		cond_resched();
			
 
				 		spin_lock(&root->fs_info->delalloc_lock);
			
@@ -7492,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 
				 		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
			
 
				 	}
			
 
				 	atomic_dec(&root->fs_info->async_submit_draining);
			
 
				-	return 0;
			
 
				+out:
			
 
				+	list_for_each_entry_safe(work, next, &works, list) {
			
 
				+		list_del_init(&work->list);
			
 
				+		btrfs_wait_and_free_delalloc_work(work);
			
 
				+	}
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
			
@@ -7512,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 
				 	unsigned long ptr;
			
 
				 	struct btrfs_file_extent_item *ei;
			
 
				 	struct extent_buffer *leaf;
			
 
				-	unsigned long nr = 0;
			
 
				 
			
 
				 	name_len = strlen(symname) + 1;
			
 
				 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
			
@@ -7610,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 
				 out_unlock:
			
 
				 	if (!err)
			
 
				 		d_instantiate(dentry, inode);
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				 	if (drop_inode) {
			
 
				 		inode_dec_link_count(inode);
			
 
				 		iput(inode);
			
 
				 	}
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 	return err;
			
 
				 }
			
 
				 
			
@@ -7679,6 +7782,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 
				 		em->len = ins.offset;
			
 
				 		em->block_start = ins.objectid;
			
 
				 		em->block_len = ins.offset;
			
 
				+		em->orig_block_len = ins.offset;
			
 
				 		em->bdev = root->fs_info->fs_devices->latest_bdev;
			
 
				 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
			
 
				 		em->generation = trans->transid;
			
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
 
				 #include "backref.h"
			
 
				 #include "rcu-string.h"
			
 
				 #include "send.h"
			
 
				+#include "dev-replace.h"
			
 
				 
			
 
				 /* Mask out flags that are inappropriate for the given type of inode. */
			
 
				 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
			
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 
				 		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
			
 
				 	}
			
 
				 
			
 
				-	if (flags & BTRFS_INODE_NODATACOW)
			
 
				+	if (flags & BTRFS_INODE_NODATACOW) {
			
 
				 		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
			
 
				+		if (S_ISREG(inode->i_mode))
			
 
				+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
			
 
				+	}
			
 
				 
			
 
				 	btrfs_update_iflags(inode);
			
 
				 }
			
@@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 
				 		ret = btrfs_commit_transaction(trans,
			
 
				 					       root->fs_info->extent_root);
			
 
				 	}
			
 
				-	if (ret)
			
 
				+	if (ret) {
			
 
				+		/* cleanup_transaction has freed this for us */
			
 
				+		if (trans->aborted)
			
 
				+			pending_snapshot = NULL;
			
 
				 		goto fail;
			
 
				+	}
			
 
				 
			
 
				 	ret = pending_snapshot->error;
			
 
				 	if (ret)
			
@@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
 
				 	if (error)
			
 
				 		goto out_dput;
			
 
				 
			
 
				+	/*
			
 
				+	 * even if this name doesn't exist, we may get hash collisions.
			
 
				+	 * check for them now when we can safely fail
			
 
				+	 */
			
 
				+	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
			
 
				+					       dir->i_ino, name,
			
 
				+					       namelen);
			
 
				+	if (error)
			
 
				+		goto out_dput;
			
 
				+
			
 
				 	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
			
 
				 
			
 
				 	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
			
@@ -1293,12 +1311,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
			
 
				+static noinline int btrfs_ioctl_resize(struct file *file,
			
 
				 					void __user *arg)
			
 
				 {
			
 
				 	u64 new_size;
			
 
				 	u64 old_size;
			
 
				 	u64 devid = 1;
			
 
				+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
			
 
				 	struct btrfs_ioctl_vol_args *vol_args;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				 	struct btrfs_device *device = NULL;
			
@@ -1313,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
 
				 
			
 
				-	mutex_lock(&root->fs_info->volume_mutex);
			
 
				-	if (root->fs_info->balance_ctl) {
			
 
				-		printk(KERN_INFO "btrfs: balance in progress\n");
			
 
				-		ret = -EINVAL;
			
 
				-		goto out;
			
 
				+	ret = mnt_want_write_file(file);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
			
 
				+			1)) {
			
 
				+		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
			
 
				+		return -EINPROGRESS;
			
 
				 	}
			
 
				 
			
 
				+	mutex_lock(&root->fs_info->volume_mutex);
			
 
				 	vol_args = memdup_user(arg, sizeof(*vol_args));
			
 
				 	if (IS_ERR(vol_args)) {
			
 
				 		ret = PTR_ERR(vol_args);
			
@@ -1339,7 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
				 		printk(KERN_INFO "btrfs: resizing devid %llu\n",
			
 
				 		       (unsigned long long)devid);
			
 
				 	}
			
 
				-	device = btrfs_find_device(root, devid, NULL, NULL);
			
 
				+	device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
			
 
				 	if (!device) {
			
 
				 		printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
			
 
				 		       (unsigned long long)devid);
			
@@ -1371,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	if (device->is_tgtdev_for_dev_replace) {
			
 
				+		ret = -EINVAL;
			
 
				+		goto out_free;
			
 
				+	}
			
 
				+
			
 
				 	old_size = device->total_bytes;
			
 
				 
			
 
				 	if (mod < 0) {
			
@@ -1409,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
				 		btrfs_commit_transaction(trans, root);
			
 
				 	} else if (new_size < old_size) {
			
 
				 		ret = btrfs_shrink_device(device, new_size);
			
 
				-	}
			
 
				+	} /* equal, nothing need to do */
			
 
				 
			
 
				 out_free:
			
 
				 	kfree(vol_args);
			
 
				 out:
			
 
				 	mutex_unlock(&root->fs_info->volume_mutex);
			
 
				+	mnt_drop_write_file(file);
			
 
				+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -2156,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 
				 	if (btrfs_root_readonly(root))
			
 
				 		return -EROFS;
			
 
				 
			
 
				+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
			
 
				+			1)) {
			
 
				+		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
			
 
				+		return -EINPROGRESS;
			
 
				+	}
			
 
				 	ret = mnt_want_write_file(file);
			
 
				-	if (ret)
			
 
				+	if (ret) {
			
 
				+		atomic_set(&root->fs_info->mutually_exclusive_operation_running,
			
 
				+			   0);
			
 
				 		return ret;
			
 
				+	}
			
 
				 
			
 
				 	switch (inode->i_mode & S_IFMT) {
			
 
				 	case S_IFDIR:
			
@@ -2210,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 
				 	}
			
 
				 out:
			
 
				 	mnt_drop_write_file(file);
			
 
				+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -2221,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
 
				 
			
 
				-	mutex_lock(&root->fs_info->volume_mutex);
			
 
				-	if (root->fs_info->balance_ctl) {
			
 
				-		printk(KERN_INFO "btrfs: balance in progress\n");
			
 
				-		ret = -EINVAL;
			
 
				-		goto out;
			
 
				+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
			
 
				+			1)) {
			
 
				+		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
			
 
				+		return -EINPROGRESS;
			
 
				 	}
			
 
				 
			
 
				+	mutex_lock(&root->fs_info->volume_mutex);
			
 
				 	vol_args = memdup_user(arg, sizeof(*vol_args));
			
 
				 	if (IS_ERR(vol_args)) {
			
 
				 		ret = PTR_ERR(vol_args);
			
@@ -2240,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 
				 	kfree(vol_args);
			
 
				 out:
			
 
				 	mutex_unlock(&root->fs_info->volume_mutex);
			
 
				+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
			
 
				+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
			
 
				 {
			
 
				+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
			
 
				 	struct btrfs_ioctl_vol_args *vol_args;
			
 
				 	int ret;
			
 
				 
			
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
 
				 
			
 
				-	if (root->fs_info->sb->s_flags & MS_RDONLY)
			
 
				-		return -EROFS;
			
 
				+	ret = mnt_want_write_file(file);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				 
			
 
				-	mutex_lock(&root->fs_info->volume_mutex);
			
 
				-	if (root->fs_info->balance_ctl) {
			
 
				-		printk(KERN_INFO "btrfs: balance in progress\n");
			
 
				-		ret = -EINVAL;
			
 
				-		goto out;
			
 
				+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
			
 
				+			1)) {
			
 
				+		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
			
 
				+		mnt_drop_write_file(file);
			
 
				+		return -EINPROGRESS;
			
 
				 	}
			
 
				 
			
 
				+	mutex_lock(&root->fs_info->volume_mutex);
			
 
				 	vol_args = memdup_user(arg, sizeof(*vol_args));
			
 
				 	if (IS_ERR(vol_args)) {
			
 
				 		ret = PTR_ERR(vol_args);
			
@@ -2273,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 
				 	kfree(vol_args);
			
 
				 out:
			
 
				 	mutex_unlock(&root->fs_info->volume_mutex);
			
 
				+	mnt_drop_write_file(file);
			
 
				+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -2328,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
 
				 		s_uuid = di_args->uuid;
			
 
				 
			
 
				 	mutex_lock(&fs_devices->device_list_mutex);
			
 
				-	dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
			
 
				+	dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
			
 
				 	mutex_unlock(&fs_devices->device_list_mutex);
			
 
				 
			
 
				 	if (!dev) {
			
@@ -2821,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 
				 	struct btrfs_disk_key disk_key;
			
 
				 	u64 objectid = 0;
			
 
				 	u64 dir_id;
			
 
				+	int ret;
			
 
				 
			
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
 
				 
			
 
				-	if (copy_from_user(&objectid, argp, sizeof(objectid)))
			
 
				-		return -EFAULT;
			
 
				+	ret = mnt_want_write_file(file);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	if (copy_from_user(&objectid, argp, sizeof(objectid))) {
			
 
				+		ret = -EFAULT;
			
 
				+		goto out;
			
 
				+	}
			
 
				 
			
 
				 	if (!objectid)
			
 
				 		objectid = root->root_key.objectid;
			
@@ -2836,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 
				 	location.offset = (u64)-1;
			
 
				 
			
 
				 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
			
 
				-	if (IS_ERR(new_root))
			
 
				-		return PTR_ERR(new_root);
			
 
				+	if (IS_ERR(new_root)) {
			
 
				+		ret = PTR_ERR(new_root);
			
 
				+		goto out;
			
 
				+	}
			
 
				 
			
 
				-	if (btrfs_root_refs(&new_root->root_item) == 0)
			
 
				-		return -ENOENT;
			
 
				+	if (btrfs_root_refs(&new_root->root_item) == 0) {
			
 
				+		ret = -ENOENT;
			
 
				+		goto out;
			
 
				+	}
			
 
				 
			
 
				 	path = btrfs_alloc_path();
			
 
				-	if (!path)
			
 
				-		return -ENOMEM;
			
 
				+	if (!path) {
			
 
				+		ret = -ENOMEM;
			
 
				+		goto out;
			
 
				+	}
			
 
				 	path->leave_spinning = 1;
			
 
				 
			
 
				 	trans = btrfs_start_transaction(root, 1);
			
 
				 	if (IS_ERR(trans)) {
			
 
				 		btrfs_free_path(path);
			
 
				-		return PTR_ERR(trans);
			
 
				+		ret = PTR_ERR(trans);
			
 
				+		goto out;
			
 
				 	}
			
 
				 
			
 
				 	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
			
@@ -2861,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 
				 		btrfs_end_transaction(trans, root);
			
 
				 		printk(KERN_ERR "Umm, you don't have the default dir item, "
			
 
				 		       "this isn't going to work\n");
			
 
				-		return -ENOENT;
			
 
				+		ret = -ENOENT;
			
 
				+		goto out;
			
 
				 	}
			
 
				 
			
 
				 	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
			
@@ -2871,8 +2931,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 
				 
			
 
				 	btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				-
			
 
				-	return 0;
			
 
				+out:
			
 
				+	mnt_drop_write_file(file);
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 void btrfs_get_block_group_info(struct list_head *groups_list,
			
@@ -3036,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
			
 
				+static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
			
 
				+					    void __user *argp)
			
 
				 {
			
 
				-	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				 	u64 transid;
			
 
				 	int ret;
			
 
				 
			
 
				-	trans = btrfs_start_transaction(root, 0);
			
 
				-	if (IS_ERR(trans))
			
 
				-		return PTR_ERR(trans);
			
 
				+	trans = btrfs_attach_transaction(root);
			
 
				+	if (IS_ERR(trans)) {
			
 
				+		if (PTR_ERR(trans) != -ENOENT)
			
 
				+			return PTR_ERR(trans);
			
 
				+
			
 
				+		/* No running transaction, don't bother */
			
 
				+		transid = root->fs_info->last_trans_committed;
			
 
				+		goto out;
			
 
				+	}
			
 
				 	transid = trans->transid;
			
 
				 	ret = btrfs_commit_transaction_async(trans, root, 0);
			
 
				 	if (ret) {
			
 
				 		btrfs_end_transaction(trans, root);
			
 
				 		return ret;
			
 
				 	}
			
 
				-
			
 
				+out:
			
 
				 	if (argp)
			
 
				 		if (copy_to_user(argp, &transid, sizeof(transid)))
			
 
				 			return -EFAULT;
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
			
 
				+static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
			
 
				+					   void __user *argp)
			
 
				 {
			
 
				-	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
			
 
				 	u64 transid;
			
 
				 
			
 
				 	if (argp) {
			
@@ -3073,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
 
				 	return btrfs_wait_for_commit(root, transid);
			
 
				 }
			
 
				 
			
 
				-static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
			
 
				+static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
			
 
				 {
			
 
				-	int ret;
			
 
				+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
			
 
				 	struct btrfs_ioctl_scrub_args *sa;
			
 
				+	int ret;
			
 
				 
			
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
@@ -3085,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
 
				 	if (IS_ERR(sa))
			
 
				 		return PTR_ERR(sa);
			
 
				 
			
 
				-	ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
			
 
				-			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
			
 
				+	if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
			
 
				+		ret = mnt_want_write_file(file);
			
 
				+		if (ret)
			
 
				+			goto out;
			
 
				+	}
			
 
				+
			
 
				+	ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
			
 
				+			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
			
 
				+			      0);
			
 
				 
			
 
				 	if (copy_to_user(arg, sa, sizeof(*sa)))
			
 
				 		ret = -EFAULT;
			
 
				 
			
 
				+	if (!(sa->flags & BTRFS_SCRUB_READONLY))
			
 
				+		mnt_drop_write_file(file);
			
 
				+out:
			
 
				 	kfree(sa);
			
 
				 	return ret;
			
 
				 }
			
@@ -3100,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
 
				 
			
 
				-	return btrfs_scrub_cancel(root);
			
 
				+	return btrfs_scrub_cancel(root->fs_info);
			
 
				 }
			
 
				 
			
 
				 static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
			
@@ -3149,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
			
 
				+{
			
 
				+	struct btrfs_ioctl_dev_replace_args *p;
			
 
				+	int ret;
			
 
				+
			
 
				+	if (!capable(CAP_SYS_ADMIN))
			
 
				+		return -EPERM;
			
 
				+
			
 
				+	p = memdup_user(arg, sizeof(*p));
			
 
				+	if (IS_ERR(p))
			
 
				+		return PTR_ERR(p);
			
 
				+
			
 
				+	switch (p->cmd) {
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
			
 
				+		if (atomic_xchg(
			
 
				+			&root->fs_info->mutually_exclusive_operation_running,
			
 
				+			1)) {
			
 
				+			pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
			
 
				+			ret = -EINPROGRESS;
			
 
				+		} else {
			
 
				+			ret = btrfs_dev_replace_start(root, p);
			
 
				+			atomic_set(
			
 
				+			 &root->fs_info->mutually_exclusive_operation_running,
			
 
				+			 0);
			
 
				+		}
			
 
				+		break;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
			
 
				+		btrfs_dev_replace_status(root->fs_info, p);
			
 
				+		ret = 0;
			
 
				+		break;
			
 
				+	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
			
 
				+		ret = btrfs_dev_replace_cancel(root->fs_info, p);
			
 
				+		break;
			
 
				+	default:
			
 
				+		ret = -EINVAL;
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	if (copy_to_user(arg, p, sizeof(*p)))
			
 
				+		ret = -EFAULT;
			
 
				+
			
 
				+	kfree(p);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
			
 
				 {
			
 
				 	int ret = 0;
			
@@ -3315,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
 
				 	struct btrfs_ioctl_balance_args *bargs;
			
 
				 	struct btrfs_balance_control *bctl;
			
 
				 	int ret;
			
 
				+	int need_to_clear_lock = 0;
			
 
				 
			
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
@@ -3350,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
 
				 		bargs = NULL;
			
 
				 	}
			
 
				 
			
 
				-	if (fs_info->balance_ctl) {
			
 
				+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
			
 
				+			1)) {
			
 
				+		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
			
 
				 		ret = -EINPROGRESS;
			
 
				 		goto out_bargs;
			
 
				 	}
			
 
				+	need_to_clear_lock = 1;
			
 
				 
			
 
				 	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
			
 
				 	if (!bctl) {
			
@@ -3387,6 +3514,9 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
 
				 out_bargs:
			
 
				 	kfree(bargs);
			
 
				 out:
			
 
				+	if (need_to_clear_lock)
			
 
				+		atomic_set(&root->fs_info->mutually_exclusive_operation_running,
			
 
				+			   0);
			
 
				 	mutex_unlock(&fs_info->balance_mutex);
			
 
				 	mutex_unlock(&fs_info->volume_mutex);
			
 
				 	mnt_drop_write_file(file);
			
@@ -3441,8 +3571,9 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
			
 
				+static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
			
 
				 {
			
 
				+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
			
 
				 	struct btrfs_ioctl_quota_ctl_args *sa;
			
 
				 	struct btrfs_trans_handle *trans = NULL;
			
 
				 	int ret;
			
@@ -3451,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
 
				 
			
 
				-	if (root->fs_info->sb->s_flags & MS_RDONLY)
			
 
				-		return -EROFS;
			
 
				+	ret = mnt_want_write_file(file);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				 
			
 
				 	sa = memdup_user(arg, sizeof(*sa));
			
 
				-	if (IS_ERR(sa))
			
 
				-		return PTR_ERR(sa);
			
 
				+	if (IS_ERR(sa)) {
			
 
				+		ret = PTR_ERR(sa);
			
 
				+		goto drop_write;
			
 
				+	}
			
 
				 
			
 
				 	if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
			
 
				 		trans = btrfs_start_transaction(root, 2);
			
@@ -3489,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
 
				 		if (err && !ret)
			
 
				 			ret = err;
			
 
				 	}
			
 
				-
			
 
				 out:
			
 
				 	kfree(sa);
			
 
				+drop_write:
			
 
				+	mnt_drop_write_file(file);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
			
 
				+static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
			
 
				 {
			
 
				+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
			
 
				 	struct btrfs_ioctl_qgroup_assign_args *sa;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				 	int ret;
			
@@ -3505,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
 
				 
			
 
				-	if (root->fs_info->sb->s_flags & MS_RDONLY)
			
 
				-		return -EROFS;
			
 
				+	ret = mnt_want_write_file(file);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				 
			
 
				 	sa = memdup_user(arg, sizeof(*sa));
			
 
				-	if (IS_ERR(sa))
			
 
				-		return PTR_ERR(sa);
			
 
				+	if (IS_ERR(sa)) {
			
 
				+		ret = PTR_ERR(sa);
			
 
				+		goto drop_write;
			
 
				+	}
			
 
				 
			
 
				 	trans = btrfs_join_transaction(root);
			
 
				 	if (IS_ERR(trans)) {
			
@@ -3533,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
 
				 
			
 
				 out:
			
 
				 	kfree(sa);
			
 
				+drop_write:
			
 
				+	mnt_drop_write_file(file);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
			
 
				+static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
			
 
				 {
			
 
				+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
			
 
				 	struct btrfs_ioctl_qgroup_create_args *sa;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				 	int ret;
			
@@ -3546,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
 
				 
			
 
				-	if (root->fs_info->sb->s_flags & MS_RDONLY)
			
 
				-		return -EROFS;
			
 
				+	ret = mnt_want_write_file(file);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				 
			
 
				 	sa = memdup_user(arg, sizeof(*sa));
			
 
				-	if (IS_ERR(sa))
			
 
				-		return PTR_ERR(sa);
			
 
				+	if (IS_ERR(sa)) {
			
 
				+		ret = PTR_ERR(sa);
			
 
				+		goto drop_write;
			
 
				+	}
			
 
				 
			
 
				 	trans = btrfs_join_transaction(root);
			
 
				 	if (IS_ERR(trans)) {
			
@@ -3573,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
 
				 
			
 
				 out:
			
 
				 	kfree(sa);
			
 
				+drop_write:
			
 
				+	mnt_drop_write_file(file);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
			
 
				+static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
			
 
				 {
			
 
				+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
			
 
				 	struct btrfs_ioctl_qgroup_limit_args *sa;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				 	int ret;
			
@@ -3587,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
 
				 	if (!capable(CAP_SYS_ADMIN))
			
 
				 		return -EPERM;
			
 
				 
			
 
				-	if (root->fs_info->sb->s_flags & MS_RDONLY)
			
 
				-		return -EROFS;
			
 
				+	ret = mnt_want_write_file(file);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				 
			
 
				 	sa = memdup_user(arg, sizeof(*sa));
			
 
				-	if (IS_ERR(sa))
			
 
				-		return PTR_ERR(sa);
			
 
				+	if (IS_ERR(sa)) {
			
 
				+		ret = PTR_ERR(sa);
			
 
				+		goto drop_write;
			
 
				+	}
			
 
				 
			
 
				 	trans = btrfs_join_transaction(root);
			
 
				 	if (IS_ERR(trans)) {
			
@@ -3615,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
 
				 
			
 
				 out:
			
 
				 	kfree(sa);
			
 
				+drop_write:
			
 
				+	mnt_drop_write_file(file);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -3735,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int
 
				 	case BTRFS_IOC_DEFRAG_RANGE:
			
 
				 		return btrfs_ioctl_defrag(file, argp);
			
 
				 	case BTRFS_IOC_RESIZE:
			
 
				-		return btrfs_ioctl_resize(root, argp);
			
 
				+		return btrfs_ioctl_resize(file, argp);
			
 
				 	case BTRFS_IOC_ADD_DEV:
			
 
				 		return btrfs_ioctl_add_dev(root, argp);
			
 
				 	case BTRFS_IOC_RM_DEV:
			
 
				-		return btrfs_ioctl_rm_dev(root, argp);
			
 
				+		return btrfs_ioctl_rm_dev(file, argp);
			
 
				 	case BTRFS_IOC_FS_INFO:
			
 
				 		return btrfs_ioctl_fs_info(root, argp);
			
 
				 	case BTRFS_IOC_DEV_INFO:
			
@@ -3768,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int
 
				 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
			
 
				 		return 0;
			
 
				 	case BTRFS_IOC_START_SYNC:
			
 
				-		return btrfs_ioctl_start_sync(file, argp);
			
 
				+		return btrfs_ioctl_start_sync(root, argp);
			
 
				 	case BTRFS_IOC_WAIT_SYNC:
			
 
				-		return btrfs_ioctl_wait_sync(file, argp);
			
 
				+		return btrfs_ioctl_wait_sync(root, argp);
			
 
				 	case BTRFS_IOC_SCRUB:
			
 
				-		return btrfs_ioctl_scrub(root, argp);
			
 
				+		return btrfs_ioctl_scrub(file, argp);
			
 
				 	case BTRFS_IOC_SCRUB_CANCEL:
			
 
				 		return btrfs_ioctl_scrub_cancel(root, argp);
			
 
				 	case BTRFS_IOC_SCRUB_PROGRESS:
			
@@ -3790,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int
 
				 	case BTRFS_IOC_GET_DEV_STATS:
			
 
				 		return btrfs_ioctl_get_dev_stats(root, argp);
			
 
				 	case BTRFS_IOC_QUOTA_CTL:
			
 
				-		return btrfs_ioctl_quota_ctl(root, argp);
			
 
				+		return btrfs_ioctl_quota_ctl(file, argp);
			
 
				 	case BTRFS_IOC_QGROUP_ASSIGN:
			
 
				-		return btrfs_ioctl_qgroup_assign(root, argp);
			
 
				+		return btrfs_ioctl_qgroup_assign(file, argp);
			
 
				 	case BTRFS_IOC_QGROUP_CREATE:
			
 
				-		return btrfs_ioctl_qgroup_create(root, argp);
			
 
				+		return btrfs_ioctl_qgroup_create(file, argp);
			
 
				 	case BTRFS_IOC_QGROUP_LIMIT:
			
 
				-		return btrfs_ioctl_qgroup_limit(root, argp);
			
 
				+		return btrfs_ioctl_qgroup_limit(file, argp);
			
 
				+	case BTRFS_IOC_DEV_REPLACE:
			
 
				+		return btrfs_ioctl_dev_replace(root, argp);
			
 
				 	}
			
 
				 
			
 
				 	return -ENOTTY;
			
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
 
				 	char name[BTRFS_PATH_NAME_MAX + 1];
			
 
				 };
			
 
				 
			
 
				+#define BTRFS_DEVICE_PATH_NAME_MAX 1024
			
 
				+
			
 
				 #define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
			
 
				 #define BTRFS_SUBVOL_RDONLY		(1ULL << 1)
			
 
				 #define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2)
			
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
 
				 	__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
			
 
				 };
			
 
				 
			
 
				-#define BTRFS_DEVICE_PATH_NAME_MAX 1024
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
			
 
				+struct btrfs_ioctl_dev_replace_start_params {
			
 
				+	__u64 srcdevid;	/* in, if 0, use srcdev_name instead */
			
 
				+	__u64 cont_reading_from_srcdev_mode;	/* in, see #define
			
 
				+						 * above */
			
 
				+	__u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];	/* in */
			
 
				+	__u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];	/* in */
			
 
				+};
			
 
				+
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED	0
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED		1
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED		2
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED		3
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED		4
			
 
				+struct btrfs_ioctl_dev_replace_status_params {
			
 
				+	__u64 replace_state;	/* out, see #define above */
			
 
				+	__u64 progress_1000;	/* out, 0 <= x <= 1000 */
			
 
				+	__u64 time_started;	/* out, seconds since 1-Jan-1970 */
			
 
				+	__u64 time_stopped;	/* out, seconds since 1-Jan-1970 */
			
 
				+	__u64 num_write_errors;	/* out */
			
 
				+	__u64 num_uncorrectable_read_errors;	/* out */
			
 
				+};
			
 
				+
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_CMD_START			0
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS			1
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL			2
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR			0
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED		1
			
 
				+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED		2
			
 
				+struct btrfs_ioctl_dev_replace_args {
			
 
				+	__u64 cmd;	/* in */
			
 
				+	__u64 result;	/* out */
			
 
				+
			
 
				+	union {
			
 
				+		struct btrfs_ioctl_dev_replace_start_params start;
			
 
				+		struct btrfs_ioctl_dev_replace_status_params status;
			
 
				+	};	/* in/out */
			
 
				+
			
 
				+	__u64 spare[64];
			
 
				+};
			
 
				+
			
 
				 struct btrfs_ioctl_dev_info_args {
			
 
				 	__u64 devid;				/* in/out */
			
 
				 	__u8 uuid[BTRFS_UUID_SIZE];		/* in/out */
			
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
 
				 			       struct btrfs_ioctl_qgroup_limit_args)
			
 
				 #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
			
 
				 				      struct btrfs_ioctl_get_dev_stats)
			
 
				+#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
			
 
				+				    struct btrfs_ioctl_dev_replace_args)
			
 
				+
			
 
				 #endif
			
--- a/fs/btrfs/math.h
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
 
				+
			
 
				+/*
			
 
				+ * Copyright (C) 2012 Fujitsu.  All rights reserved.
			
 
				+ * Written by Miao Xie <miaox@cn.fujitsu.com>
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public
			
 
				+ * License v2 as published by the Free Software Foundation.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful,
			
 
				+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+ * General Public License for more details.
			
 
				+ *
			
 
				+ * You should have received a copy of the GNU General Public
			
 
				+ * License along with this program; if not, write to the
			
 
				+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
			
 
				+ * Boston, MA 02111-1307, USA.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __BTRFS_MATH_H
			
 
				+#define __BTRFS_MATH_H
			
 
				+
			
 
				+#include <asm/div64.h>
			
 
				+
			
 
				+static inline u64 div_factor(u64 num, int factor)
			
 
				+{
			
 
				+	if (factor == 10)
			
 
				+		return num;
			
 
				+	num *= factor;
			
 
				+	do_div(num, 10);
			
 
				+	return num;
			
 
				+}
			
 
				+
			
 
				+static inline u64 div_factor_fine(u64 num, int factor)
			
 
				+{
			
 
				+	if (factor == 100)
			
 
				+		return num;
			
 
				+	num *= factor;
			
 
				+	do_div(num, 100);
			
 
				+	return num;
			
 
				+}
			
 
				+
			
 
				+#endif
			
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 
				 	init_waitqueue_head(&entry->wait);
			
 
				 	INIT_LIST_HEAD(&entry->list);
			
 
				 	INIT_LIST_HEAD(&entry->root_extent_list);
			
 
				+	INIT_LIST_HEAD(&entry->work_list);
			
 
				+	init_completion(&entry->completion);
			
 
				 
			
 
				 	trace_btrfs_ordered_extent_add(inode, entry);
			
 
				 
			
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 
				 	wake_up(&entry->wait);
			
 
				 }
			
 
				 
			
 
				+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
			
 
				+{
			
 
				+	struct btrfs_ordered_extent *ordered;
			
 
				+
			
 
				+	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
			
 
				+	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
			
 
				+	complete(&ordered->completion);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * wait for all the ordered extents in a root.  This is done when balancing
			
 
				  * space between drives.
			
 
				  */
			
 
				 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
			
 
				 {
			
 
				-	struct list_head splice;
			
 
				+	struct list_head splice, works;
			
 
				 	struct list_head *cur;
			
 
				-	struct btrfs_ordered_extent *ordered;
			
 
				+	struct btrfs_ordered_extent *ordered, *next;
			
 
				 	struct inode *inode;
			
 
				 
			
 
				 	INIT_LIST_HEAD(&splice);
			
 
				+	INIT_LIST_HEAD(&works);
			
 
				 
			
 
				 	spin_lock(&root->fs_info->ordered_extent_lock);
			
 
				 	list_splice_init(&root->fs_info->ordered_extents, &splice);
			
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 
				 		spin_unlock(&root->fs_info->ordered_extent_lock);
			
 
				 
			
 
				 		if (inode) {
			
 
				-			btrfs_start_ordered_extent(inode, ordered, 1);
			
 
				-			btrfs_put_ordered_extent(ordered);
			
 
				-			if (delay_iput)
			
 
				-				btrfs_add_delayed_iput(inode);
			
 
				-			else
			
 
				-				iput(inode);
			
 
				+			ordered->flush_work.func = btrfs_run_ordered_extent_work;
			
 
				+			list_add_tail(&ordered->work_list, &works);
			
 
				+			btrfs_queue_worker(&root->fs_info->flush_workers,
			
 
				+					   &ordered->flush_work);
			
 
				 		} else {
			
 
				 			btrfs_put_ordered_extent(ordered);
			
 
				 		}
			
 
				 
			
 
				+		cond_resched();
			
 
				 		spin_lock(&root->fs_info->ordered_extent_lock);
			
 
				 	}
			
 
				 	spin_unlock(&root->fs_info->ordered_extent_lock);
			
 
				+
			
 
				+	list_for_each_entry_safe(ordered, next, &works, work_list) {
			
 
				+		list_del_init(&ordered->work_list);
			
 
				+		wait_for_completion(&ordered->completion);
			
 
				+
			
 
				+		inode = ordered->inode;
			
 
				+		btrfs_put_ordered_extent(ordered);
			
 
				+		if (delay_iput)
			
 
				+			btrfs_add_delayed_iput(inode);
			
 
				+		else
			
 
				+			iput(inode);
			
 
				+
			
 
				+		cond_resched();
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 
				  * extra check to make sure the ordered operation list really is empty
			
 
				  * before we return
			
 
				  */
			
 
				-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
			
 
				+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
			
 
				 {
			
 
				 	struct btrfs_inode *btrfs_inode;
			
 
				 	struct inode *inode;
			
 
				 	struct list_head splice;
			
 
				+	struct list_head works;
			
 
				+	struct btrfs_delalloc_work *work, *next;
			
 
				+	int ret = 0;
			
 
				 
			
 
				 	INIT_LIST_HEAD(&splice);
			
 
				+	INIT_LIST_HEAD(&works);
			
 
				 
			
 
				 	mutex_lock(&root->fs_info->ordered_operations_mutex);
			
 
				 	spin_lock(&root->fs_info->ordered_extent_lock);
			
@@ -533,6 +562,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
 
				 	list_splice_init(&root->fs_info->ordered_operations, &splice);
			
 
				 
			
 
				 	while (!list_empty(&splice)) {
			
 
				+
			
 
				 		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
			
 
				 				   ordered_operations);
			
 
				 
			
@@ -549,15 +579,26 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
 
				 			list_add_tail(&BTRFS_I(inode)->ordered_operations,
			
 
				 			      &root->fs_info->ordered_operations);
			
 
				 		}
			
 
				+
			
 
				+		if (!inode)
			
 
				+			continue;
			
 
				 		spin_unlock(&root->fs_info->ordered_extent_lock);
			
 
				 
			
 
				-		if (inode) {
			
 
				-			if (wait)
			
 
				-				btrfs_wait_ordered_range(inode, 0, (u64)-1);
			
 
				-			else
			
 
				-				filemap_flush(inode->i_mapping);
			
 
				-			btrfs_add_delayed_iput(inode);
			
 
				+		work = btrfs_alloc_delalloc_work(inode, wait, 1);
			
 
				+		if (!work) {
			
 
				+			if (list_empty(&BTRFS_I(inode)->ordered_operations))
			
 
				+				list_add_tail(&btrfs_inode->ordered_operations,
			
 
				+					      &splice);
			
 
				+			spin_lock(&root->fs_info->ordered_extent_lock);
			
 
				+			list_splice_tail(&splice,
			
 
				+					 &root->fs_info->ordered_operations);
			
 
				+			spin_unlock(&root->fs_info->ordered_extent_lock);
			
 
				+			ret = -ENOMEM;
			
 
				+			goto out;
			
 
				 		}
			
 
				+		list_add_tail(&work->list, &works);
			
 
				+		btrfs_queue_worker(&root->fs_info->flush_workers,
			
 
				+				   &work->work);
			
 
				 
			
 
				 		cond_resched();
			
 
				 		spin_lock(&root->fs_info->ordered_extent_lock);
			
@@ -566,7 +607,13 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
 
				 		goto again;
			
 
				 
			
 
				 	spin_unlock(&root->fs_info->ordered_extent_lock);
			
 
				+out:
			
 
				+	list_for_each_entry_safe(work, next, &works, list) {
			
 
				+		list_del_init(&work->list);
			
 
				+		btrfs_wait_and_free_delalloc_work(work);
			
 
				+	}
			
 
				 	mutex_unlock(&root->fs_info->ordered_operations_mutex);
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 
				 	u64 end;
			
 
				 	u64 orig_end;
			
 
				 	struct btrfs_ordered_extent *ordered;
			
 
				-	int found;
			
 
				 
			
 
				 	if (start + len < start) {
			
 
				 		orig_end = INT_LIMIT(loff_t);
			
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 
				 	filemap_fdatawait_range(inode->i_mapping, start, orig_end);
			
 
				 
			
 
				 	end = orig_end;
			
 
				-	found = 0;
			
 
				 	while (1) {
			
 
				 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
			
 
				 		if (!ordered)
			
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 
				 			btrfs_put_ordered_extent(ordered);
			
 
				 			break;
			
 
				 		}
			
 
				-		found++;
			
 
				 		btrfs_start_ordered_extent(inode, ordered, 1);
			
 
				 		end = ordered->file_offset;
			
 
				 		btrfs_put_ordered_extent(ordered);
			
@@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 
				 	if (last_mod < root->fs_info->last_trans_committed)
			
 
				 		return;
			
 
				 
			
 
				-	/*
			
 
				-	 * the transaction is already committing.  Just start the IO and
			
 
				-	 * don't bother with all of this list nonsense
			
 
				-	 */
			
 
				-	if (trans && root->fs_info->running_transaction->blocked) {
			
 
				-		btrfs_wait_ordered_range(inode, 0, (u64)-1);
			
 
				-		return;
			
 
				-	}
			
 
				-
			
 
				 	spin_lock(&root->fs_info->ordered_extent_lock);
			
 
				 	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
			
 
				 		list_add_tail(&BTRFS_I(inode)->ordered_operations,
			
@@ -959,6 +994,7 @@ int __init ordered_data_init(void)
 
				 				     NULL);
			
 
				 	if (!btrfs_ordered_extent_cache)
			
 
				 		return -ENOMEM;
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
 
				 	struct list_head root_extent_list;
			
 
				 
			
 
				 	struct btrfs_work work;
			
 
				-};
			
 
				 
			
 
				+	struct completion completion;
			
 
				+	struct btrfs_work flush_work;
			
 
				+	struct list_head work_list;
			
 
				+};
			
 
				 
			
 
				 /*
			
 
				  * calculates the total size you need to allocate for an ordered sum
			
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 
				 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
			
 
				 				struct btrfs_ordered_extent *ordered);
			
 
				 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
			
 
				-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
			
 
				+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
			
 
				 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
			
 
				 				 struct btrfs_root *root,
			
 
				 				 struct inode *inode);
			
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 
				 		case BTRFS_DEV_STATS_KEY:
			
 
				 			printk(KERN_INFO "\t\tdevice stats\n");
			
 
				 			break;
			
 
				+		case BTRFS_DEV_REPLACE_KEY:
			
 
				+			printk(KERN_INFO "\t\tdev replace\n");
			
 
				+			break;
			
 
				 		};
			
 
				 	}
			
 
				 }
			
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
 
				 #include "volumes.h"
			
 
				 #include "disk-io.h"
			
 
				 #include "transaction.h"
			
 
				+#include "dev-replace.h"
			
 
				 
			
 
				 #undef DEBUG
			
 
				 
			
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 	struct reada_extent *re = NULL;
			
 
				 	struct reada_extent *re_exist = NULL;
			
 
				 	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				-	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
			
 
				 	struct btrfs_bio *bbio = NULL;
			
 
				 	struct btrfs_device *dev;
			
 
				 	struct btrfs_device *prev_dev;
			
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 	int nzones = 0;
			
 
				 	int i;
			
 
				 	unsigned long index = logical >> PAGE_CACHE_SHIFT;
			
 
				+	int dev_replace_is_ongoing;
			
 
				 
			
 
				 	spin_lock(&fs_info->reada_lock);
			
 
				 	re = radix_tree_lookup(&fs_info->reada_tree, index);
			
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 	 * map block
			
 
				 	 */
			
 
				 	length = blocksize;
			
 
				-	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
			
 
				+	ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
			
 
				+			      &bbio, 0);
			
 
				 	if (ret || !bbio || length < blocksize)
			
 
				 		goto error;
			
 
				 
			
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 	}
			
 
				 
			
 
				 	/* insert extent in reada_tree + all per-device trees, all or nothing */
			
 
				+	btrfs_dev_replace_lock(&fs_info->dev_replace);
			
 
				 	spin_lock(&fs_info->reada_lock);
			
 
				 	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
			
 
				 	if (ret == -EEXIST) {
			
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 		BUG_ON(!re_exist);
			
 
				 		re_exist->refcnt++;
			
 
				 		spin_unlock(&fs_info->reada_lock);
			
 
				+		btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				 		goto error;
			
 
				 	}
			
 
				 	if (ret) {
			
 
				 		spin_unlock(&fs_info->reada_lock);
			
 
				+		btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				 		goto error;
			
 
				 	}
			
 
				 	prev_dev = NULL;
			
 
				+	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
			
 
				+			&fs_info->dev_replace);
			
 
				 	for (i = 0; i < nzones; ++i) {
			
 
				 		dev = bbio->stripes[i].dev;
			
 
				 		if (dev == prev_dev) {
			
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 			 */
			
 
				 			continue;
			
 
				 		}
			
 
				+		if (!dev->bdev) {
			
 
				+			/* cannot read ahead on missing device */
			
 
				+			continue;
			
 
				+		}
			
 
				+		if (dev_replace_is_ongoing &&
			
 
				+		    dev == fs_info->dev_replace.tgtdev) {
			
 
				+			/*
			
 
				+			 * as this device is selected for reading only as
			
 
				+			 * a last resort, skip it for read ahead.
			
 
				+			 */
			
 
				+			continue;
			
 
				+		}
			
 
				 		prev_dev = dev;
			
 
				 		ret = radix_tree_insert(&dev->reada_extents, index, re);
			
 
				 		if (ret) {
			
 
				 			while (--i >= 0) {
			
 
				 				dev = bbio->stripes[i].dev;
			
 
				 				BUG_ON(dev == NULL);
			
 
				+				/* ignore whether the entry was inserted */
			
 
				 				radix_tree_delete(&dev->reada_extents, index);
			
 
				 			}
			
 
				 			BUG_ON(fs_info == NULL);
			
 
				 			radix_tree_delete(&fs_info->reada_tree, index);
			
 
				 			spin_unlock(&fs_info->reada_lock);
			
 
				+			btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				 			goto error;
			
 
				 		}
			
 
				 	}
			
 
				 	spin_unlock(&fs_info->reada_lock);
			
 
				+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				 
			
 
				 	kfree(bbio);
			
 
				 	return re;
			
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 
				 	generation = btrfs_header_generation(node);
			
 
				 	free_extent_buffer(node);
			
 
				 
			
 
				-	reada_add_block(rc, start, &max_key, level, generation);
			
 
				+	if (reada_add_block(rc, start, &max_key, level, generation)) {
			
 
				+		kfree(rc);
			
 
				+		return ERR_PTR(-ENOMEM);
			
 
				+	}
			
 
				 
			
 
				 	reada_start_machine(root->fs_info);
			
 
				 
			
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 
				 	struct btrfs_root_item *root_item;
			
 
				 	struct btrfs_path *path;
			
 
				 	struct extent_buffer *leaf;
			
 
				-	unsigned long nr;
			
 
				 	int level;
			
 
				 	int max_level;
			
 
				 	int replaced = 0;
			
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 
				 		BUG_ON(IS_ERR(trans));
			
 
				 		trans->block_rsv = rc->block_rsv;
			
 
				 
			
 
				-		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
			
 
				+		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
			
 
				+					     BTRFS_RESERVE_FLUSH_ALL);
			
 
				 		if (ret) {
			
 
				 			BUG_ON(ret != -EAGAIN);
			
 
				 			ret = btrfs_commit_transaction(trans, root);
			
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 
				 			       path->slots[level]);
			
 
				 		root_item->drop_level = level;
			
 
				 
			
 
				-		nr = trans->blocks_used;
			
 
				 		btrfs_end_transaction_throttle(trans, root);
			
 
				 
			
 
				-		btrfs_btree_balance_dirty(root, nr);
			
 
				+		btrfs_btree_balance_dirty(root);
			
 
				 
			
 
				 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			
 
				 			invalidate_extent_cache(root, &key, &next_key);
			
@@ -2155,10 +2154,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 
				 		btrfs_update_reloc_root(trans, root);
			
 
				 	}
			
 
				 
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction_throttle(trans, root);
			
 
				 
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 
			
 
				 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
			
 
				 		invalidate_extent_cache(root, &key, &next_key);
			
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 
				 again:
			
 
				 	if (!err) {
			
 
				 		num_bytes = rc->merging_rsv_size;
			
 
				-		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
			
 
				+		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
			
 
				+					  BTRFS_RESERVE_FLUSH_ALL);
			
 
				 		if (ret)
			
 
				 			err = ret;
			
 
				 	}
			
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
 
				 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
			
 
				 
			
 
				 	trans->block_rsv = rc->block_rsv;
			
 
				-	ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
			
 
				+	ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
			
 
				+				  BTRFS_RESERVE_FLUSH_ALL);
			
 
				 	if (ret) {
			
 
				 		if (ret == -EAGAIN)
			
 
				 			rc->commit_transaction = 1;
			
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
 
				 	struct btrfs_path *path;
			
 
				 	struct btrfs_root *root = fs_info->tree_root;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				-	unsigned long nr;
			
 
				 	int ret = 0;
			
 
				 
			
 
				 	if (inode)
			
@@ -3293,9 +3292,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
 
				 	ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
			
 
				 
			
 
				 	btrfs_free_path(path);
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 out:
			
 
				 	iput(inode);
			
 
				 	return ret;
			
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
 
				 	 * is no reservation in transaction handle.
			
 
				 	 */
			
 
				 	ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
			
 
				-				  rc->extent_root->nodesize * 256);
			
 
				+				  rc->extent_root->nodesize * 256,
			
 
				+				  BTRFS_RESERVE_FLUSH_ALL);
			
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
				 	struct btrfs_trans_handle *trans = NULL;
			
 
				 	struct btrfs_path *path;
			
 
				 	struct btrfs_extent_item *ei;
			
 
				-	unsigned long nr;
			
 
				 	u64 flags;
			
 
				 	u32 item_size;
			
 
				 	int ret;
			
@@ -3828,9 +3826,8 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
				 			ret = btrfs_commit_transaction(trans, rc->extent_root);
			
 
				 			BUG_ON(ret);
			
 
				 		} else {
			
 
				-			nr = trans->blocks_used;
			
 
				 			btrfs_end_transaction_throttle(trans, rc->extent_root);
			
 
				-			btrfs_btree_balance_dirty(rc->extent_root, nr);
			
 
				+			btrfs_btree_balance_dirty(rc->extent_root);
			
 
				 		}
			
 
				 		trans = NULL;
			
 
				 
			
@@ -3860,9 +3857,8 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
				 			  GFP_NOFS);
			
 
				 
			
 
				 	if (trans) {
			
 
				-		nr = trans->blocks_used;
			
 
				 		btrfs_end_transaction_throttle(trans, rc->extent_root);
			
 
				-		btrfs_btree_balance_dirty(rc->extent_root, nr);
			
 
				+		btrfs_btree_balance_dirty(rc->extent_root);
			
 
				 	}
			
 
				 
			
 
				 	if (!err) {
			
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 
				 	struct btrfs_trans_handle *trans;
			
 
				 	struct btrfs_root *root;
			
 
				 	struct btrfs_key key;
			
 
				-	unsigned long nr;
			
 
				 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
			
 
				 	int err = 0;
			
 
				 
			
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 
				 
			
 
				 	err = btrfs_orphan_add(trans, inode);
			
 
				 out:
			
 
				-	nr = trans->blocks_used;
			
 
				 	btrfs_end_transaction(trans, root);
			
 
				-	btrfs_btree_balance_dirty(root, nr);
			
 
				+	btrfs_btree_balance_dirty(root);
			
 
				 	if (err) {
			
 
				 		if (inode)
			
 
				 			iput(inode);
			
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 
				 	       (unsigned long long)rc->block_group->key.objectid,
			
 
				 	       (unsigned long long)rc->block_group->flags);
			
 
				 
			
 
				-	btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
			
 
				+	ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
			
 
				+	if (ret < 0) {
			
 
				+		err = ret;
			
 
				+		goto out;
			
 
				+	}
			
 
				 	btrfs_wait_ordered_extents(fs_info->tree_root, 0);
			
 
				 
			
 
				 	while (1) {
			
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 
				 	struct btrfs_root_item *item = &root->root_item;
			
 
				 	struct timespec ct = CURRENT_TIME;
			
 
				 
			
 
				-	spin_lock(&root->root_times_lock);
			
 
				+	spin_lock(&root->root_item_lock);
			
 
				 	item->ctransid = cpu_to_le64(trans->transid);
			
 
				 	item->ctime.sec = cpu_to_le64(ct.tv_sec);
			
 
				 	item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
			
 
				-	spin_unlock(&root->root_times_lock);
			
 
				+	spin_unlock(&root->root_item_lock);
			
 
				 }
			
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
 
				 /*
			
 
				- * Copyright (C) 2011 STRATO.  All rights reserved.
			
 
				+ * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
			
 
				  *
			
 
				  * This program is free software; you can redistribute it and/or
			
 
				  * modify it under the terms of the GNU General Public
			
@@ -25,6 +25,7 @@
 
				 #include "transaction.h"
			
 
				 #include "backref.h"
			
 
				 #include "extent_io.h"
			
 
				+#include "dev-replace.h"
			
 
				 #include "check-integrity.h"
			
 
				 #include "rcu-string.h"
			
 
				 
			
@@ -42,10 +43,23 @@
 
				  */
			
 
				 
			
 
				 struct scrub_block;
			
 
				-struct scrub_dev;
			
 
				+struct scrub_ctx;
			
 
				 
			
 
				-#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
			
 
				-#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
			
 
				+/*
			
 
				+ * the following three values only influence the performance.
			
 
				+ * The last one configures the number of parallel and outstanding I/O
			
 
				+ * operations. The first two values configure an upper limit for the number
			
 
				+ * of (dynamically allocated) pages that are added to a bio.
			
 
				+ */
			
 
				+#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
			
 
				+#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
			
 
				+#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
			
 
				+
			
 
				+/*
			
 
				+ * the following value times PAGE_SIZE needs to be large enough to match the
			
 
				+ * largest node/leaf/sector size that shall be supported.
			
 
				+ * Values larger than BTRFS_STRIPE_LEN are not supported.
			
 
				+ */
			
 
				 #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
			
 
				 
			
 
				 struct scrub_page {
			
@@ -56,6 +70,8 @@ struct scrub_page {
 
				 	u64			generation;
			
 
				 	u64			logical;
			
 
				 	u64			physical;
			
 
				+	u64			physical_for_dev_replace;
			
 
				+	atomic_t		ref_count;
			
 
				 	struct {
			
 
				 		unsigned int	mirror_num:8;
			
 
				 		unsigned int	have_csum:1;
			
@@ -66,23 +82,28 @@ struct scrub_page {
 
				 
			
 
				 struct scrub_bio {
			
 
				 	int			index;
			
 
				-	struct scrub_dev	*sdev;
			
 
				+	struct scrub_ctx	*sctx;
			
 
				+	struct btrfs_device	*dev;
			
 
				 	struct bio		*bio;
			
 
				 	int			err;
			
 
				 	u64			logical;
			
 
				 	u64			physical;
			
 
				-	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
			
 
				+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
			
 
				+	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
			
 
				+#else
			
 
				+	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
			
 
				+#endif
			
 
				 	int			page_count;
			
 
				 	int			next_free;
			
 
				 	struct btrfs_work	work;
			
 
				 };
			
 
				 
			
 
				 struct scrub_block {
			
 
				-	struct scrub_page	pagev[SCRUB_MAX_PAGES_PER_BLOCK];
			
 
				+	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
			
 
				 	int			page_count;
			
 
				 	atomic_t		outstanding_pages;
			
 
				 	atomic_t		ref_count; /* free mem on transition to zero */
			
 
				-	struct scrub_dev	*sdev;
			
 
				+	struct scrub_ctx	*sctx;
			
 
				 	struct {
			
 
				 		unsigned int	header_error:1;
			
 
				 		unsigned int	checksum_error:1;
			
@@ -91,23 +112,35 @@ struct scrub_block {
 
				 	};
			
 
				 };
			
 
				 
			
 
				-struct scrub_dev {
			
 
				-	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
			
 
				-	struct btrfs_device	*dev;
			
 
				+struct scrub_wr_ctx {
			
 
				+	struct scrub_bio *wr_curr_bio;
			
 
				+	struct btrfs_device *tgtdev;
			
 
				+	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
			
 
				+	atomic_t flush_all_writes;
			
 
				+	struct mutex wr_lock;
			
 
				+};
			
 
				+
			
 
				+struct scrub_ctx {
			
 
				+	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
			
 
				+	struct btrfs_root	*dev_root;
			
 
				 	int			first_free;
			
 
				 	int			curr;
			
 
				-	atomic_t		in_flight;
			
 
				-	atomic_t		fixup_cnt;
			
 
				+	atomic_t		bios_in_flight;
			
 
				+	atomic_t		workers_pending;
			
 
				 	spinlock_t		list_lock;
			
 
				 	wait_queue_head_t	list_wait;
			
 
				 	u16			csum_size;
			
 
				 	struct list_head	csum_list;
			
 
				 	atomic_t		cancel_req;
			
 
				 	int			readonly;
			
 
				-	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
			
 
				+	int			pages_per_rd_bio;
			
 
				 	u32			sectorsize;
			
 
				 	u32			nodesize;
			
 
				 	u32			leafsize;
			
 
				+
			
 
				+	int			is_dev_replace;
			
 
				+	struct scrub_wr_ctx	wr_ctx;
			
 
				+
			
 
				 	/*
			
 
				 	 * statistics
			
 
				 	 */
			
@@ -116,13 +149,23 @@ struct scrub_dev {
 
				 };
			
 
				 
			
 
				 struct scrub_fixup_nodatasum {
			
 
				-	struct scrub_dev	*sdev;
			
 
				+	struct scrub_ctx	*sctx;
			
 
				+	struct btrfs_device	*dev;
			
 
				 	u64			logical;
			
 
				 	struct btrfs_root	*root;
			
 
				 	struct btrfs_work	work;
			
 
				 	int			mirror_num;
			
 
				 };
			
 
				 
			
 
				+struct scrub_copy_nocow_ctx {
			
 
				+	struct scrub_ctx	*sctx;
			
 
				+	u64			logical;
			
 
				+	u64			len;
			
 
				+	int			mirror_num;
			
 
				+	u64			physical_for_dev_replace;
			
 
				+	struct btrfs_work	work;
			
 
				+};
			
 
				+
			
 
				 struct scrub_warning {
			
 
				 	struct btrfs_path	*path;
			
 
				 	u64			extent_item_size;
			
@@ -137,15 +180,20 @@ struct scrub_warning {
 
				 };
			
 
				 
			
 
				 
			
 
				+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
			
 
				+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
			
 
				+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
			
 
				+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
			
 
				 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
			
 
				-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
			
 
				-				     struct btrfs_mapping_tree *map_tree,
			
 
				+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
			
 
				+				     struct btrfs_fs_info *fs_info,
			
 
				+				     struct scrub_block *original_sblock,
			
 
				 				     u64 length, u64 logical,
			
 
				-				     struct scrub_block *sblock);
			
 
				-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
			
 
				-			       struct scrub_block *sblock, int is_metadata,
			
 
				-			       int have_csum, u8 *csum, u64 generation,
			
 
				-			       u16 csum_size);
			
 
				+				     struct scrub_block *sblocks_for_recheck);
			
 
				+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
			
 
				+				struct scrub_block *sblock, int is_metadata,
			
 
				+				int have_csum, u8 *csum, u64 generation,
			
 
				+				u16 csum_size);
			
 
				 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
			
 
				 					 struct scrub_block *sblock,
			
 
				 					 int is_metadata, int have_csum,
			
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 
				 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
			
 
				 					    struct scrub_block *sblock_good,
			
 
				 					    int page_num, int force_write);
			
 
				+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
			
 
				+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
			
 
				+					   int page_num);
			
 
				 static int scrub_checksum_data(struct scrub_block *sblock);
			
 
				 static int scrub_checksum_tree_block(struct scrub_block *sblock);
			
 
				 static int scrub_checksum_super(struct scrub_block *sblock);
			
 
				 static void scrub_block_get(struct scrub_block *sblock);
			
 
				 static void scrub_block_put(struct scrub_block *sblock);
			
 
				-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
			
 
				-				 struct scrub_page *spage);
			
 
				-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
			
 
				-		       u64 physical, u64 flags, u64 gen, int mirror_num,
			
 
				-		       u8 *csum, int force);
			
 
				+static void scrub_page_get(struct scrub_page *spage);
			
 
				+static void scrub_page_put(struct scrub_page *spage);
			
 
				+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
			
 
				+				    struct scrub_page *spage);
			
 
				+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			
 
				+		       u64 physical, struct btrfs_device *dev, u64 flags,
			
 
				+		       u64 gen, int mirror_num, u8 *csum, int force,
			
 
				+		       u64 physical_for_dev_replace);
			
 
				 static void scrub_bio_end_io(struct bio *bio, int err);
			
 
				 static void scrub_bio_end_io_worker(struct btrfs_work *work);
			
 
				 static void scrub_block_complete(struct scrub_block *sblock);
			
 
				+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			
 
				+			       u64 extent_logical, u64 extent_len,
			
 
				+			       u64 *extent_physical,
			
 
				+			       struct btrfs_device **extent_dev,
			
 
				+			       int *extent_mirror_num);
			
 
				+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
			
 
				+			      struct scrub_wr_ctx *wr_ctx,
			
 
				+			      struct btrfs_fs_info *fs_info,
			
 
				+			      struct btrfs_device *dev,
			
 
				+			      int is_dev_replace);
			
 
				+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
			
 
				+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
			
 
				+				    struct scrub_page *spage);
			
 
				+static void scrub_wr_submit(struct scrub_ctx *sctx);
			
 
				+static void scrub_wr_bio_end_io(struct bio *bio, int err);
			
 
				+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
			
 
				+static int write_page_nocow(struct scrub_ctx *sctx,
			
 
				+			    u64 physical_for_dev_replace, struct page *page);
			
 
				+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
			
 
				+				      void *ctx);
			
 
				+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			
 
				+			    int mirror_num, u64 physical_for_dev_replace);
			
 
				+static void copy_nocow_pages_worker(struct btrfs_work *work);
			
 
				+
			
 
				+
			
 
				+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
			
 
				+{
			
 
				+	atomic_inc(&sctx->bios_in_flight);
			
 
				+}
			
 
				+
			
 
				+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
			
 
				+{
			
 
				+	atomic_dec(&sctx->bios_in_flight);
			
 
				+	wake_up(&sctx->list_wait);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * used for workers that require transaction commits (i.e., for the
			
 
				+ * NOCOW case)
			
 
				+ */
			
 
				+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
			
 
				+{
			
 
				+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
			
 
				+
			
 
				+	/*
			
 
				+	 * increment scrubs_running to prevent cancel requests from
			
 
				+	 * completing as long as a worker is running. we must also
			
 
				+	 * increment scrubs_paused to prevent deadlocking on pause
			
 
				+	 * requests used for transactions commits (as the worker uses a
			
 
				+	 * transaction context). it is safe to regard the worker
			
 
				+	 * as paused for all matters practical. effectively, we only
			
 
				+	 * avoid cancellation requests from completing.
			
 
				+	 */
			
 
				+	mutex_lock(&fs_info->scrub_lock);
			
 
				+	atomic_inc(&fs_info->scrubs_running);
			
 
				+	atomic_inc(&fs_info->scrubs_paused);
			
 
				+	mutex_unlock(&fs_info->scrub_lock);
			
 
				+	atomic_inc(&sctx->workers_pending);
			
 
				+}
			
 
				 
			
 
				+/* used for workers that require transaction commits */
			
 
				+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
			
 
				+{
			
 
				+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
			
 
				 
			
 
				-static void scrub_free_csums(struct scrub_dev *sdev)
			
 
				+	/*
			
 
				+	 * see scrub_pending_trans_workers_inc() why we're pretending
			
 
				+	 * to be paused in the scrub counters
			
 
				+	 */
			
 
				+	mutex_lock(&fs_info->scrub_lock);
			
 
				+	atomic_dec(&fs_info->scrubs_running);
			
 
				+	atomic_dec(&fs_info->scrubs_paused);
			
 
				+	mutex_unlock(&fs_info->scrub_lock);
			
 
				+	atomic_dec(&sctx->workers_pending);
			
 
				+	wake_up(&fs_info->scrub_pause_wait);
			
 
				+	wake_up(&sctx->list_wait);
			
 
				+}
			
 
				+
			
 
				+static void scrub_free_csums(struct scrub_ctx *sctx)
			
 
				 {
			
 
				-	while (!list_empty(&sdev->csum_list)) {
			
 
				+	while (!list_empty(&sctx->csum_list)) {
			
 
				 		struct btrfs_ordered_sum *sum;
			
 
				-		sum = list_first_entry(&sdev->csum_list,
			
 
				+		sum = list_first_entry(&sctx->csum_list,
			
 
				 				       struct btrfs_ordered_sum, list);
			
 
				 		list_del(&sum->list);
			
 
				 		kfree(sum);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
			
 
				+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
			
 
				 {
			
 
				 	int i;
			
 
				 
			
 
				-	if (!sdev)
			
 
				+	if (!sctx)
			
 
				 		return;
			
 
				 
			
 
				+	scrub_free_wr_ctx(&sctx->wr_ctx);
			
 
				+
			
 
				 	/* this can happen when scrub is cancelled */
			
 
				-	if (sdev->curr != -1) {
			
 
				-		struct scrub_bio *sbio = sdev->bios[sdev->curr];
			
 
				+	if (sctx->curr != -1) {
			
 
				+		struct scrub_bio *sbio = sctx->bios[sctx->curr];
			
 
				 
			
 
				 		for (i = 0; i < sbio->page_count; i++) {
			
 
				-			BUG_ON(!sbio->pagev[i]);
			
 
				-			BUG_ON(!sbio->pagev[i]->page);
			
 
				+			WARN_ON(!sbio->pagev[i]->page);
			
 
				 			scrub_block_put(sbio->pagev[i]->sblock);
			
 
				 		}
			
 
				 		bio_put(sbio->bio);
			
 
				 	}
			
 
				 
			
 
				-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
			
 
				-		struct scrub_bio *sbio = sdev->bios[i];
			
 
				+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
			
 
				+		struct scrub_bio *sbio = sctx->bios[i];
			
 
				 
			
 
				 		if (!sbio)
			
 
				 			break;
			
 
				 		kfree(sbio);
			
 
				 	}
			
 
				 
			
 
				-	scrub_free_csums(sdev);
			
 
				-	kfree(sdev);
			
 
				+	scrub_free_csums(sctx);
			
 
				+	kfree(sctx);
			
 
				 }
			
 
				 
			
 
				 static noinline_for_stack
			
 
				-struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
			
 
				+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
			
 
				 {
			
 
				-	struct scrub_dev *sdev;
			
 
				+	struct scrub_ctx *sctx;
			
 
				 	int		i;
			
 
				 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
			
 
				-	int pages_per_bio;
			
 
				+	int pages_per_rd_bio;
			
 
				+	int ret;
			
 
				 
			
 
				-	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
			
 
				-			      bio_get_nr_vecs(dev->bdev));
			
 
				-	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
			
 
				-	if (!sdev)
			
 
				+	/*
			
 
				+	 * the setting of pages_per_rd_bio is correct for scrub but might
			
 
				+	 * be wrong for the dev_replace code where we might read from
			
 
				+	 * different devices in the initial huge bios. However, that
			
 
				+	 * code is able to correctly handle the case when adding a page
			
 
				+	 * to a bio fails.
			
 
				+	 */
			
 
				+	if (dev->bdev)
			
 
				+		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
			
 
				+					 bio_get_nr_vecs(dev->bdev));
			
 
				+	else
			
 
				+		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
			
 
				+	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
			
 
				+	if (!sctx)
			
 
				 		goto nomem;
			
 
				-	sdev->dev = dev;
			
 
				-	sdev->pages_per_bio = pages_per_bio;
			
 
				-	sdev->curr = -1;
			
 
				-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
			
 
				+	sctx->is_dev_replace = is_dev_replace;
			
 
				+	sctx->pages_per_rd_bio = pages_per_rd_bio;
			
 
				+	sctx->curr = -1;
			
 
				+	sctx->dev_root = dev->dev_root;
			
 
				+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
			
 
				 		struct scrub_bio *sbio;
			
 
				 
			
 
				 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
			
 
				 		if (!sbio)
			
 
				 			goto nomem;
			
 
				-		sdev->bios[i] = sbio;
			
 
				+		sctx->bios[i] = sbio;
			
 
				 
			
 
				 		sbio->index = i;
			
 
				-		sbio->sdev = sdev;
			
 
				+		sbio->sctx = sctx;
			
 
				 		sbio->page_count = 0;
			
 
				 		sbio->work.func = scrub_bio_end_io_worker;
			
 
				 
			
 
				-		if (i != SCRUB_BIOS_PER_DEV-1)
			
 
				-			sdev->bios[i]->next_free = i + 1;
			
 
				+		if (i != SCRUB_BIOS_PER_SCTX - 1)
			
 
				+			sctx->bios[i]->next_free = i + 1;
			
 
				 		else
			
 
				-			sdev->bios[i]->next_free = -1;
			
 
				-	}
			
 
				-	sdev->first_free = 0;
			
 
				-	sdev->nodesize = dev->dev_root->nodesize;
			
 
				-	sdev->leafsize = dev->dev_root->leafsize;
			
 
				-	sdev->sectorsize = dev->dev_root->sectorsize;
			
 
				-	atomic_set(&sdev->in_flight, 0);
			
 
				-	atomic_set(&sdev->fixup_cnt, 0);
			
 
				-	atomic_set(&sdev->cancel_req, 0);
			
 
				-	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
			
 
				-	INIT_LIST_HEAD(&sdev->csum_list);
			
 
				-
			
 
				-	spin_lock_init(&sdev->list_lock);
			
 
				-	spin_lock_init(&sdev->stat_lock);
			
 
				-	init_waitqueue_head(&sdev->list_wait);
			
 
				-	return sdev;
			
 
				+			sctx->bios[i]->next_free = -1;
			
 
				+	}
			
 
				+	sctx->first_free = 0;
			
 
				+	sctx->nodesize = dev->dev_root->nodesize;
			
 
				+	sctx->leafsize = dev->dev_root->leafsize;
			
 
				+	sctx->sectorsize = dev->dev_root->sectorsize;
			
 
				+	atomic_set(&sctx->bios_in_flight, 0);
			
 
				+	atomic_set(&sctx->workers_pending, 0);
			
 
				+	atomic_set(&sctx->cancel_req, 0);
			
 
				+	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
			
 
				+	INIT_LIST_HEAD(&sctx->csum_list);
			
 
				+
			
 
				+	spin_lock_init(&sctx->list_lock);
			
 
				+	spin_lock_init(&sctx->stat_lock);
			
 
				+	init_waitqueue_head(&sctx->list_wait);
			
 
				+
			
 
				+	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
			
 
				+				 fs_info->dev_replace.tgtdev, is_dev_replace);
			
 
				+	if (ret) {
			
 
				+		scrub_free_ctx(sctx);
			
 
				+		return ERR_PTR(ret);
			
 
				+	}
			
 
				+	return sctx;
			
 
				 
			
 
				 nomem:
			
 
				-	scrub_free_dev(sdev);
			
 
				+	scrub_free_ctx(sctx);
			
 
				 	return ERR_PTR(-ENOMEM);
			
 
				 }
			
 
				 
			
 
				-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
			
 
				+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
			
 
				+				     void *warn_ctx)
			
 
				 {
			
 
				 	u64 isize;
			
 
				 	u32 nlink;
			
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 
				 	int i;
			
 
				 	struct extent_buffer *eb;
			
 
				 	struct btrfs_inode_item *inode_item;
			
 
				-	struct scrub_warning *swarn = ctx;
			
 
				+	struct scrub_warning *swarn = warn_ctx;
			
 
				 	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
			
 
				 	struct inode_fs_paths *ipath = NULL;
			
 
				 	struct btrfs_root *local_root;
			
@@ -345,8 +496,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 
				 
			
 
				 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
			
 
				 {
			
 
				-	struct btrfs_device *dev = sblock->sdev->dev;
			
 
				-	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
			
 
				+	struct btrfs_device *dev;
			
 
				+	struct btrfs_fs_info *fs_info;
			
 
				 	struct btrfs_path *path;
			
 
				 	struct btrfs_key found_key;
			
 
				 	struct extent_buffer *eb;
			
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 
				 	const int bufsize = 4096;
			
 
				 	int ret;
			
 
				 
			
 
				+	WARN_ON(sblock->page_count < 1);
			
 
				+	dev = sblock->pagev[0]->dev;
			
 
				+	fs_info = sblock->sctx->dev_root->fs_info;
			
 
				+
			
 
				 	path = btrfs_alloc_path();
			
 
				 
			
 
				 	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
			
 
				 	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
			
 
				-	BUG_ON(sblock->page_count < 1);
			
 
				-	swarn.sector = (sblock->pagev[0].physical) >> 9;
			
 
				-	swarn.logical = sblock->pagev[0].logical;
			
 
				+	swarn.sector = (sblock->pagev[0]->physical) >> 9;
			
 
				+	swarn.logical = sblock->pagev[0]->logical;
			
 
				 	swarn.errstr = errstr;
			
 
				-	swarn.dev = dev;
			
 
				+	swarn.dev = NULL;
			
 
				 	swarn.msg_bufsize = bufsize;
			
 
				 	swarn.scratch_bufsize = bufsize;
			
 
				 
			
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 
				 		} while (ret != 1);
			
 
				 	} else {
			
 
				 		swarn.path = path;
			
 
				+		swarn.dev = dev;
			
 
				 		iterate_extent_inodes(fs_info, found_key.objectid,
			
 
				 					extent_item_pos, 1,
			
 
				 					scrub_print_warning_inode, &swarn);
			
@@ -416,11 +571,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 
				 	kfree(swarn.msg_buf);
			
 
				 }
			
 
				 
			
 
				-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
			
 
				+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
			
 
				 {
			
 
				 	struct page *page = NULL;
			
 
				 	unsigned long index;
			
 
				-	struct scrub_fixup_nodatasum *fixup = ctx;
			
 
				+	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
			
 
				 	int ret;
			
 
				 	int corrected = 0;
			
 
				 	struct btrfs_key key;
			
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 
				 	}
			
 
				 
			
 
				 	if (PageUptodate(page)) {
			
 
				-		struct btrfs_mapping_tree *map_tree;
			
 
				+		struct btrfs_fs_info *fs_info;
			
 
				 		if (PageDirty(page)) {
			
 
				 			/*
			
 
				 			 * we need to write the data to the defect sector. the
			
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 
				 			ret = -EIO;
			
 
				 			goto out;
			
 
				 		}
			
 
				-		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
			
 
				-		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
			
 
				+		fs_info = BTRFS_I(inode)->root->fs_info;
			
 
				+		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
			
 
				 					fixup->logical, page,
			
 
				 					fixup->mirror_num);
			
 
				 		unlock_page(page);
			
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 
				 {
			
 
				 	int ret;
			
 
				 	struct scrub_fixup_nodatasum *fixup;
			
 
				-	struct scrub_dev *sdev;
			
 
				+	struct scrub_ctx *sctx;
			
 
				 	struct btrfs_trans_handle *trans = NULL;
			
 
				 	struct btrfs_fs_info *fs_info;
			
 
				 	struct btrfs_path *path;
			
 
				 	int uncorrectable = 0;
			
 
				 
			
 
				 	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
			
 
				-	sdev = fixup->sdev;
			
 
				+	sctx = fixup->sctx;
			
 
				 	fs_info = fixup->root->fs_info;
			
 
				 
			
 
				 	path = btrfs_alloc_path();
			
 
				 	if (!path) {
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		++sdev->stat.malloc_errors;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		++sctx->stat.malloc_errors;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 		uncorrectable = 1;
			
 
				 		goto out;
			
 
				 	}
			
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 
				 	}
			
 
				 	WARN_ON(ret != 1);
			
 
				 
			
 
				-	spin_lock(&sdev->stat_lock);
			
 
				-	++sdev->stat.corrected_errors;
			
 
				-	spin_unlock(&sdev->stat_lock);
			
 
				+	spin_lock(&sctx->stat_lock);
			
 
				+	++sctx->stat.corrected_errors;
			
 
				+	spin_unlock(&sctx->stat_lock);
			
 
				 
			
 
				 out:
			
 
				 	if (trans && !IS_ERR(trans))
			
 
				 		btrfs_end_transaction(trans, fixup->root);
			
 
				 	if (uncorrectable) {
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		++sdev->stat.uncorrectable_errors;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				-
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		++sctx->stat.uncorrectable_errors;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				+		btrfs_dev_replace_stats_inc(
			
 
				+			&sctx->dev_root->fs_info->dev_replace.
			
 
				+			num_uncorrectable_read_errors);
			
 
				 		printk_ratelimited_in_rcu(KERN_ERR
			
 
				 			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
			
 
				 			(unsigned long long)fixup->logical,
			
 
				-			rcu_str_deref(sdev->dev->name));
			
 
				+			rcu_str_deref(fixup->dev->name));
			
 
				 	}
			
 
				 
			
 
				 	btrfs_free_path(path);
			
 
				 	kfree(fixup);
			
 
				 
			
 
				-	/* see caller why we're pretending to be paused in the scrub counters */
			
 
				-	mutex_lock(&fs_info->scrub_lock);
			
 
				-	atomic_dec(&fs_info->scrubs_running);
			
 
				-	atomic_dec(&fs_info->scrubs_paused);
			
 
				-	mutex_unlock(&fs_info->scrub_lock);
			
 
				-	atomic_dec(&sdev->fixup_cnt);
			
 
				-	wake_up(&fs_info->scrub_pause_wait);
			
 
				-	wake_up(&sdev->list_wait);
			
 
				+	scrub_pending_trans_workers_dec(sctx);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -614,7 +764,8 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 
				  */
			
 
				 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
			
 
				 {
			
 
				-	struct scrub_dev *sdev = sblock_to_check->sdev;
			
 
				+	struct scrub_ctx *sctx = sblock_to_check->sctx;
			
 
				+	struct btrfs_device *dev;
			
 
				 	struct btrfs_fs_info *fs_info;
			
 
				 	u64 length;
			
 
				 	u64 logical;
			
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 				      DEFAULT_RATELIMIT_BURST);
			
 
				 
			
 
				 	BUG_ON(sblock_to_check->page_count < 1);
			
 
				-	fs_info = sdev->dev->dev_root->fs_info;
			
 
				+	fs_info = sctx->dev_root->fs_info;
			
 
				+	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
			
 
				+		/*
			
 
				+		 * if we find an error in a super block, we just report it.
			
 
				+		 * They will get written with the next transaction commit
			
 
				+		 * anyway
			
 
				+		 */
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		++sctx->stat.super_errors;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				+		return 0;
			
 
				+	}
			
 
				 	length = sblock_to_check->page_count * PAGE_SIZE;
			
 
				-	logical = sblock_to_check->pagev[0].logical;
			
 
				-	generation = sblock_to_check->pagev[0].generation;
			
 
				-	BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
			
 
				-	failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
			
 
				-	is_metadata = !(sblock_to_check->pagev[0].flags &
			
 
				+	logical = sblock_to_check->pagev[0]->logical;
			
 
				+	generation = sblock_to_check->pagev[0]->generation;
			
 
				+	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
			
 
				+	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
			
 
				+	is_metadata = !(sblock_to_check->pagev[0]->flags &
			
 
				 			BTRFS_EXTENT_FLAG_DATA);
			
 
				-	have_csum = sblock_to_check->pagev[0].have_csum;
			
 
				-	csum = sblock_to_check->pagev[0].csum;
			
 
				+	have_csum = sblock_to_check->pagev[0]->have_csum;
			
 
				+	csum = sblock_to_check->pagev[0]->csum;
			
 
				+	dev = sblock_to_check->pagev[0]->dev;
			
 
				+
			
 
				+	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
			
 
				+		sblocks_for_recheck = NULL;
			
 
				+		goto nodatasum_case;
			
 
				+	}
			
 
				 
			
 
				 	/*
			
 
				 	 * read all mirrors one after the other. This includes to
			
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 				     sizeof(*sblocks_for_recheck),
			
 
				 				     GFP_NOFS);
			
 
				 	if (!sblocks_for_recheck) {
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.malloc_errors++;
			
 
				-		sdev->stat.read_errors++;
			
 
				-		sdev->stat.uncorrectable_errors++;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				-		btrfs_dev_stat_inc_and_print(sdev->dev,
			
 
				-					     BTRFS_DEV_STAT_READ_ERRS);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.malloc_errors++;
			
 
				+		sctx->stat.read_errors++;
			
 
				+		sctx->stat.uncorrectable_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				 	/* setup the context, map the logical blocks and alloc the pages */
			
 
				-	ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
			
 
				+	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
			
 
				 					logical, sblocks_for_recheck);
			
 
				 	if (ret) {
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.read_errors++;
			
 
				-		sdev->stat.uncorrectable_errors++;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				-		btrfs_dev_stat_inc_and_print(sdev->dev,
			
 
				-					     BTRFS_DEV_STAT_READ_ERRS);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.read_errors++;
			
 
				+		sctx->stat.uncorrectable_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
			
 
				 		goto out;
			
 
				 	}
			
 
				 	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
			
 
				 	sblock_bad = sblocks_for_recheck + failed_mirror_index;
			
 
				 
			
 
				 	/* build and submit the bios for the failed mirror, check checksums */
			
 
				-	ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
			
 
				-				  csum, generation, sdev->csum_size);
			
 
				-	if (ret) {
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.read_errors++;
			
 
				-		sdev->stat.uncorrectable_errors++;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				-		btrfs_dev_stat_inc_and_print(sdev->dev,
			
 
				-					     BTRFS_DEV_STAT_READ_ERRS);
			
 
				-		goto out;
			
 
				-	}
			
 
				+	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
			
 
				+			    csum, generation, sctx->csum_size);
			
 
				 
			
 
				 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
			
 
				 	    sblock_bad->no_io_error_seen) {
			
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 		 * different bio (usually one of the two latter cases is
			
 
				 		 * the cause)
			
 
				 		 */
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.unverified_errors++;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.unverified_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 
			
 
				+		if (sctx->is_dev_replace)
			
 
				+			scrub_write_block_to_dev_replace(sblock_bad);
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				 	if (!sblock_bad->no_io_error_seen) {
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.read_errors++;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.read_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 		if (__ratelimit(&_rs))
			
 
				 			scrub_print_warning("i/o error", sblock_to_check);
			
 
				-		btrfs_dev_stat_inc_and_print(sdev->dev,
			
 
				-					     BTRFS_DEV_STAT_READ_ERRS);
			
 
				+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
			
 
				 	} else if (sblock_bad->checksum_error) {
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.csum_errors++;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.csum_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 		if (__ratelimit(&_rs))
			
 
				 			scrub_print_warning("checksum error", sblock_to_check);
			
 
				-		btrfs_dev_stat_inc_and_print(sdev->dev,
			
 
				+		btrfs_dev_stat_inc_and_print(dev,
			
 
				 					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
			
 
				 	} else if (sblock_bad->header_error) {
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.verify_errors++;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.verify_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 		if (__ratelimit(&_rs))
			
 
				 			scrub_print_warning("checksum/header error",
			
 
				 					    sblock_to_check);
			
 
				 		if (sblock_bad->generation_error)
			
 
				-			btrfs_dev_stat_inc_and_print(sdev->dev,
			
 
				+			btrfs_dev_stat_inc_and_print(dev,
			
 
				 				BTRFS_DEV_STAT_GENERATION_ERRS);
			
 
				 		else
			
 
				-			btrfs_dev_stat_inc_and_print(sdev->dev,
			
 
				+			btrfs_dev_stat_inc_and_print(dev,
			
 
				 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
			
 
				 	}
			
 
				 
			
 
				-	if (sdev->readonly)
			
 
				+	if (sctx->readonly && !sctx->is_dev_replace)
			
 
				 		goto did_not_correct_error;
			
 
				 
			
 
				 	if (!is_metadata && !have_csum) {
			
 
				 		struct scrub_fixup_nodatasum *fixup_nodatasum;
			
 
				 
			
 
				+nodatasum_case:
			
 
				+		WARN_ON(sctx->is_dev_replace);
			
 
				+
			
 
				 		/*
			
 
				 		 * !is_metadata and !have_csum, this means that the data
			
 
				 		 * might not be COW'ed, that it might be modified
			
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
			
 
				 		if (!fixup_nodatasum)
			
 
				 			goto did_not_correct_error;
			
 
				-		fixup_nodatasum->sdev = sdev;
			
 
				+		fixup_nodatasum->sctx = sctx;
			
 
				+		fixup_nodatasum->dev = dev;
			
 
				 		fixup_nodatasum->logical = logical;
			
 
				 		fixup_nodatasum->root = fs_info->extent_root;
			
 
				 		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
			
 
				-		/*
			
 
				-		 * increment scrubs_running to prevent cancel requests from
			
 
				-		 * completing as long as a fixup worker is running. we must also
			
 
				-		 * increment scrubs_paused to prevent deadlocking on pause
			
 
				-		 * requests used for transactions commits (as the worker uses a
			
 
				-		 * transaction context). it is safe to regard the fixup worker
			
 
				-		 * as paused for all matters practical. effectively, we only
			
 
				-		 * avoid cancellation requests from completing.
			
 
				-		 */
			
 
				-		mutex_lock(&fs_info->scrub_lock);
			
 
				-		atomic_inc(&fs_info->scrubs_running);
			
 
				-		atomic_inc(&fs_info->scrubs_paused);
			
 
				-		mutex_unlock(&fs_info->scrub_lock);
			
 
				-		atomic_inc(&sdev->fixup_cnt);
			
 
				+		scrub_pending_trans_workers_inc(sctx);
			
 
				 		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
			
 
				 		btrfs_queue_worker(&fs_info->scrub_workers,
			
 
				 				   &fixup_nodatasum->work);
			
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 
			
 
				 	/*
			
 
				 	 * now build and submit the bios for the other mirrors, check
			
 
				-	 * checksums
			
 
				-	 */
			
 
				-	for (mirror_index = 0;
			
 
				-	     mirror_index < BTRFS_MAX_MIRRORS &&
			
 
				-	     sblocks_for_recheck[mirror_index].page_count > 0;
			
 
				-	     mirror_index++) {
			
 
				-		if (mirror_index == failed_mirror_index)
			
 
				-			continue;
			
 
				-
			
 
				-		/* build and submit the bios, check checksums */
			
 
				-		ret = scrub_recheck_block(fs_info,
			
 
				-					  sblocks_for_recheck + mirror_index,
			
 
				-					  is_metadata, have_csum, csum,
			
 
				-					  generation, sdev->csum_size);
			
 
				-		if (ret)
			
 
				-			goto did_not_correct_error;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * first try to pick the mirror which is completely without I/O
			
 
				+	 * checksums.
			
 
				+	 * First try to pick the mirror which is completely without I/O
			
 
				 	 * errors and also does not have a checksum error.
			
 
				 	 * If one is found, and if a checksum is present, the full block
			
 
				 	 * that is known to contain an error is rewritten. Afterwards
			
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 	     mirror_index < BTRFS_MAX_MIRRORS &&
			
 
				 	     sblocks_for_recheck[mirror_index].page_count > 0;
			
 
				 	     mirror_index++) {
			
 
				-		struct scrub_block *sblock_other = sblocks_for_recheck +
			
 
				-						   mirror_index;
			
 
				+		struct scrub_block *sblock_other;
			
 
				+
			
 
				+		if (mirror_index == failed_mirror_index)
			
 
				+			continue;
			
 
				+		sblock_other = sblocks_for_recheck + mirror_index;
			
 
				+
			
 
				+		/* build and submit the bios, check checksums */
			
 
				+		scrub_recheck_block(fs_info, sblock_other, is_metadata,
			
 
				+				    have_csum, csum, generation,
			
 
				+				    sctx->csum_size);
			
 
				 
			
 
				 		if (!sblock_other->header_error &&
			
 
				 		    !sblock_other->checksum_error &&
			
 
				 		    sblock_other->no_io_error_seen) {
			
 
				-			int force_write = is_metadata || have_csum;
			
 
				-
			
 
				-			ret = scrub_repair_block_from_good_copy(sblock_bad,
			
 
				-								sblock_other,
			
 
				-								force_write);
			
 
				+			if (sctx->is_dev_replace) {
			
 
				+				scrub_write_block_to_dev_replace(sblock_other);
			
 
				+			} else {
			
 
				+				int force_write = is_metadata || have_csum;
			
 
				+
			
 
				+				ret = scrub_repair_block_from_good_copy(
			
 
				+						sblock_bad, sblock_other,
			
 
				+						force_write);
			
 
				+			}
			
 
				 			if (0 == ret)
			
 
				 				goto corrected_error;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	/*
			
 
				-	 * in case of I/O errors in the area that is supposed to be
			
 
				+	 * for dev_replace, pick good pages and write to the target device.
			
 
				+	 */
			
 
				+	if (sctx->is_dev_replace) {
			
 
				+		success = 1;
			
 
				+		for (page_num = 0; page_num < sblock_bad->page_count;
			
 
				+		     page_num++) {
			
 
				+			int sub_success;
			
 
				+
			
 
				+			sub_success = 0;
			
 
				+			for (mirror_index = 0;
			
 
				+			     mirror_index < BTRFS_MAX_MIRRORS &&
			
 
				+			     sblocks_for_recheck[mirror_index].page_count > 0;
			
 
				+			     mirror_index++) {
			
 
				+				struct scrub_block *sblock_other =
			
 
				+					sblocks_for_recheck + mirror_index;
			
 
				+				struct scrub_page *page_other =
			
 
				+					sblock_other->pagev[page_num];
			
 
				+
			
 
				+				if (!page_other->io_error) {
			
 
				+					ret = scrub_write_page_to_dev_replace(
			
 
				+							sblock_other, page_num);
			
 
				+					if (ret == 0) {
			
 
				+						/* succeeded for this page */
			
 
				+						sub_success = 1;
			
 
				+						break;
			
 
				+					} else {
			
 
				+						btrfs_dev_replace_stats_inc(
			
 
				+							&sctx->dev_root->
			
 
				+							fs_info->dev_replace.
			
 
				+							num_write_errors);
			
 
				+					}
			
 
				+				}
			
 
				+			}
			
 
				+
			
 
				+			if (!sub_success) {
			
 
				+				/*
			
 
				+				 * did not find a mirror to fetch the page
			
 
				+				 * from. scrub_write_page_to_dev_replace()
			
 
				+				 * handles this case (page->io_error), by
			
 
				+				 * filling the block with zeros before
			
 
				+				 * submitting the write request
			
 
				+				 */
			
 
				+				success = 0;
			
 
				+				ret = scrub_write_page_to_dev_replace(
			
 
				+						sblock_bad, page_num);
			
 
				+				if (ret)
			
 
				+					btrfs_dev_replace_stats_inc(
			
 
				+						&sctx->dev_root->fs_info->
			
 
				+						dev_replace.num_write_errors);
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * for regular scrub, repair those pages that are errored.
			
 
				+	 * In case of I/O errors in the area that is supposed to be
			
 
				 	 * repaired, continue by picking good copies of those pages.
			
 
				 	 * Select the good pages from mirrors to rewrite bad pages from
			
 
				 	 * the area to fix. Afterwards verify the checksum of the block
			
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 
			
 
				 	success = 1;
			
 
				 	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
			
 
				-		struct scrub_page *page_bad = sblock_bad->pagev + page_num;
			
 
				+		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
			
 
				 
			
 
				 		if (!page_bad->io_error)
			
 
				 			continue;
			
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 		     mirror_index++) {
			
 
				 			struct scrub_block *sblock_other = sblocks_for_recheck +
			
 
				 							   mirror_index;
			
 
				-			struct scrub_page *page_other = sblock_other->pagev +
			
 
				-							page_num;
			
 
				+			struct scrub_page *page_other = sblock_other->pagev[
			
 
				+							page_num];
			
 
				 
			
 
				 			if (!page_other->io_error) {
			
 
				 				ret = scrub_repair_page_from_good_copy(
			
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 			 * is verified, but most likely the data comes out
			
 
				 			 * of the page cache.
			
 
				 			 */
			
 
				-			ret = scrub_recheck_block(fs_info, sblock_bad,
			
 
				-						  is_metadata, have_csum, csum,
			
 
				-						  generation, sdev->csum_size);
			
 
				-			if (!ret && !sblock_bad->header_error &&
			
 
				+			scrub_recheck_block(fs_info, sblock_bad,
			
 
				+					    is_metadata, have_csum, csum,
			
 
				+					    generation, sctx->csum_size);
			
 
				+			if (!sblock_bad->header_error &&
			
 
				 			    !sblock_bad->checksum_error &&
			
 
				 			    sblock_bad->no_io_error_seen)
			
 
				 				goto corrected_error;
			
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 				goto did_not_correct_error;
			
 
				 		} else {
			
 
				 corrected_error:
			
 
				-			spin_lock(&sdev->stat_lock);
			
 
				-			sdev->stat.corrected_errors++;
			
 
				-			spin_unlock(&sdev->stat_lock);
			
 
				+			spin_lock(&sctx->stat_lock);
			
 
				+			sctx->stat.corrected_errors++;
			
 
				+			spin_unlock(&sctx->stat_lock);
			
 
				 			printk_ratelimited_in_rcu(KERN_ERR
			
 
				 				"btrfs: fixed up error at logical %llu on dev %s\n",
			
 
				 				(unsigned long long)logical,
			
 
				-				rcu_str_deref(sdev->dev->name));
			
 
				+				rcu_str_deref(dev->name));
			
 
				 		}
			
 
				 	} else {
			
 
				 did_not_correct_error:
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.uncorrectable_errors++;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.uncorrectable_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 		printk_ratelimited_in_rcu(KERN_ERR
			
 
				 			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
			
 
				 			(unsigned long long)logical,
			
 
				-			rcu_str_deref(sdev->dev->name));
			
 
				+			rcu_str_deref(dev->name));
			
 
				 	}
			
 
				 
			
 
				 out:
			
@@ -966,11 +1166,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 						     mirror_index;
			
 
				 			int page_index;
			
 
				 
			
 
				-			for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
			
 
				-			     page_index++)
			
 
				-				if (sblock->pagev[page_index].page)
			
 
				-					__free_page(
			
 
				-						sblock->pagev[page_index].page);
			
 
				+			for (page_index = 0; page_index < sblock->page_count;
			
 
				+			     page_index++) {
			
 
				+				sblock->pagev[page_index]->sblock = NULL;
			
 
				+				scrub_page_put(sblock->pagev[page_index]);
			
 
				+			}
			
 
				 		}
			
 
				 		kfree(sblocks_for_recheck);
			
 
				 	}
			
@@ -978,8 +1178,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
			
 
				-				     struct btrfs_mapping_tree *map_tree,
			
 
				+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
			
 
				+				     struct btrfs_fs_info *fs_info,
			
 
				+				     struct scrub_block *original_sblock,
			
 
				 				     u64 length, u64 logical,
			
 
				 				     struct scrub_block *sblocks_for_recheck)
			
 
				 {
			
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 
				 	int ret;
			
 
				 
			
 
				 	/*
			
 
				-	 * note: the three members sdev, ref_count and outstanding_pages
			
 
				+	 * note: the two members ref_count and outstanding_pages
			
 
				 	 * are not used (and not set) in the blocks that are used for
			
 
				 	 * the recheck procedure
			
 
				 	 */
			
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 
				 		 * with a length of PAGE_SIZE, each returned stripe
			
 
				 		 * represents one mirror
			
 
				 		 */
			
 
				-		ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
			
 
				-				      &bbio, 0);
			
 
				+		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
			
 
				+				      &mapped_length, &bbio, 0);
			
 
				 		if (ret || !bbio || mapped_length < sublen) {
			
 
				 			kfree(bbio);
			
 
				 			return -EIO;
			
 
				 		}
			
 
				 
			
 
				-		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
			
 
				+		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
			
 
				 		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
			
 
				 		     mirror_index++) {
			
 
				 			struct scrub_block *sblock;
			
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 
				 				continue;
			
 
				 
			
 
				 			sblock = sblocks_for_recheck + mirror_index;
			
 
				-			page = sblock->pagev + page_index;
			
 
				+			sblock->sctx = sctx;
			
 
				+			page = kzalloc(sizeof(*page), GFP_NOFS);
			
 
				+			if (!page) {
			
 
				+leave_nomem:
			
 
				+				spin_lock(&sctx->stat_lock);
			
 
				+				sctx->stat.malloc_errors++;
			
 
				+				spin_unlock(&sctx->stat_lock);
			
 
				+				kfree(bbio);
			
 
				+				return -ENOMEM;
			
 
				+			}
			
 
				+			scrub_page_get(page);
			
 
				+			sblock->pagev[page_index] = page;
			
 
				 			page->logical = logical;
			
 
				 			page->physical = bbio->stripes[mirror_index].physical;
			
 
				+			BUG_ON(page_index >= original_sblock->page_count);
			
 
				+			page->physical_for_dev_replace =
			
 
				+				original_sblock->pagev[page_index]->
			
 
				+				physical_for_dev_replace;
			
 
				 			/* for missing devices, dev->bdev is NULL */
			
 
				 			page->dev = bbio->stripes[mirror_index].dev;
			
 
				 			page->mirror_num = mirror_index + 1;
			
 
				-			page->page = alloc_page(GFP_NOFS);
			
 
				-			if (!page->page) {
			
 
				-				spin_lock(&sdev->stat_lock);
			
 
				-				sdev->stat.malloc_errors++;
			
 
				-				spin_unlock(&sdev->stat_lock);
			
 
				-				kfree(bbio);
			
 
				-				return -ENOMEM;
			
 
				-			}
			
 
				 			sblock->page_count++;
			
 
				+			page->page = alloc_page(GFP_NOFS);
			
 
				+			if (!page->page)
			
 
				+				goto leave_nomem;
			
 
				 		}
			
 
				 		kfree(bbio);
			
 
				 		length -= sublen;
			
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 
				  * to take those pages that are not errored from all the mirrors so that
			
 
				  * the pages that are errored in the just handled mirror can be repaired.
			
 
				  */
			
 
				-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
			
 
				-			       struct scrub_block *sblock, int is_metadata,
			
 
				-			       int have_csum, u8 *csum, u64 generation,
			
 
				-			       u16 csum_size)
			
 
				+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
			
 
				+				struct scrub_block *sblock, int is_metadata,
			
 
				+				int have_csum, u8 *csum, u64 generation,
			
 
				+				u16 csum_size)
			
 
				 {
			
 
				 	int page_num;
			
 
				 
			
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
				 
			
 
				 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
			
 
				 		struct bio *bio;
			
 
				-		int ret;
			
 
				-		struct scrub_page *page = sblock->pagev + page_num;
			
 
				+		struct scrub_page *page = sblock->pagev[page_num];
			
 
				 		DECLARE_COMPLETION_ONSTACK(complete);
			
 
				 
			
 
				 		if (page->dev->bdev == NULL) {
			
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
				 			continue;
			
 
				 		}
			
 
				 
			
 
				-		BUG_ON(!page->page);
			
 
				+		WARN_ON(!page->page);
			
 
				 		bio = bio_alloc(GFP_NOFS, 1);
			
 
				-		if (!bio)
			
 
				-			return -EIO;
			
 
				+		if (!bio) {
			
 
				+			page->io_error = 1;
			
 
				+			sblock->no_io_error_seen = 0;
			
 
				+			continue;
			
 
				+		}
			
 
				 		bio->bi_bdev = page->dev->bdev;
			
 
				 		bio->bi_sector = page->physical >> 9;
			
 
				 		bio->bi_end_io = scrub_complete_bio_end_io;
			
 
				 		bio->bi_private = &complete;
			
 
				 
			
 
				-		ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
			
 
				-		if (PAGE_SIZE != ret) {
			
 
				-			bio_put(bio);
			
 
				-			return -EIO;
			
 
				-		}
			
 
				+		bio_add_page(bio, page->page, PAGE_SIZE, 0);
			
 
				 		btrfsic_submit_bio(READ, bio);
			
 
				 
			
 
				 		/* this will also unplug the queue */
			
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
				 					     have_csum, csum, generation,
			
 
				 					     csum_size);
			
 
				 
			
 
				-	return 0;
			
 
				+	return;
			
 
				 }
			
 
				 
			
 
				 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
			
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 
				 	struct btrfs_root *root = fs_info->extent_root;
			
 
				 	void *mapped_buffer;
			
 
				 
			
 
				-	BUG_ON(!sblock->pagev[0].page);
			
 
				+	WARN_ON(!sblock->pagev[0]->page);
			
 
				 	if (is_metadata) {
			
 
				 		struct btrfs_header *h;
			
 
				 
			
 
				-		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
			
 
				+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
			
 
				 		h = (struct btrfs_header *)mapped_buffer;
			
 
				 
			
 
				-		if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
			
 
				+		if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
			
 
				 		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
			
 
				 		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
			
 
				 			   BTRFS_UUID_SIZE)) {
			
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 
				 		if (!have_csum)
			
 
				 			return;
			
 
				 
			
 
				-		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
			
 
				+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
			
 
				 	}
			
 
				 
			
 
				 	for (page_num = 0;;) {
			
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 
				 		page_num++;
			
 
				 		if (page_num >= sblock->page_count)
			
 
				 			break;
			
 
				-		BUG_ON(!sblock->pagev[page_num].page);
			
 
				+		WARN_ON(!sblock->pagev[page_num]->page);
			
 
				 
			
 
				-		mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
			
 
				+		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
			
 
				 	}
			
 
				 
			
 
				 	btrfs_csum_final(crc, calculated_csum);
			
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 
				 					    struct scrub_block *sblock_good,
			
 
				 					    int page_num, int force_write)
			
 
				 {
			
 
				-	struct scrub_page *page_bad = sblock_bad->pagev + page_num;
			
 
				-	struct scrub_page *page_good = sblock_good->pagev + page_num;
			
 
				+	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
			
 
				+	struct scrub_page *page_good = sblock_good->pagev[page_num];
			
 
				 
			
 
				-	BUG_ON(sblock_bad->pagev[page_num].page == NULL);
			
 
				-	BUG_ON(sblock_good->pagev[page_num].page == NULL);
			
 
				+	BUG_ON(page_bad->page == NULL);
			
 
				+	BUG_ON(page_good->page == NULL);
			
 
				 	if (force_write || sblock_bad->header_error ||
			
 
				 	    sblock_bad->checksum_error || page_bad->io_error) {
			
 
				 		struct bio *bio;
			
 
				 		int ret;
			
 
				 		DECLARE_COMPLETION_ONSTACK(complete);
			
 
				 
			
 
				+		if (!page_bad->dev->bdev) {
			
 
				+			printk_ratelimited(KERN_WARNING
			
 
				+				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
			
 
				+			return -EIO;
			
 
				+		}
			
 
				+
			
 
				 		bio = bio_alloc(GFP_NOFS, 1);
			
 
				 		if (!bio)
			
 
				 			return -EIO;
			
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 
				 		if (!bio_flagged(bio, BIO_UPTODATE)) {
			
 
				 			btrfs_dev_stat_inc_and_print(page_bad->dev,
			
 
				 				BTRFS_DEV_STAT_WRITE_ERRS);
			
 
				+			btrfs_dev_replace_stats_inc(
			
 
				+				&sblock_bad->sctx->dev_root->fs_info->
			
 
				+				dev_replace.num_write_errors);
			
 
				 			bio_put(bio);
			
 
				 			return -EIO;
			
 
				 		}
			
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static void scrub_checksum(struct scrub_block *sblock)
			
 
				+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
			
 
				+{
			
 
				+	int page_num;
			
 
				+
			
 
				+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
			
 
				+		int ret;
			
 
				+
			
 
				+		ret = scrub_write_page_to_dev_replace(sblock, page_num);
			
 
				+		if (ret)
			
 
				+			btrfs_dev_replace_stats_inc(
			
 
				+				&sblock->sctx->dev_root->fs_info->dev_replace.
			
 
				+				num_write_errors);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
			
 
				+					   int page_num)
			
 
				+{
			
 
				+	struct scrub_page *spage = sblock->pagev[page_num];
			
 
				+
			
 
				+	BUG_ON(spage->page == NULL);
			
 
				+	if (spage->io_error) {
			
 
				+		void *mapped_buffer = kmap_atomic(spage->page);
			
 
				+
			
 
				+		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
			
 
				+		flush_dcache_page(spage->page);
			
 
				+		kunmap_atomic(mapped_buffer);
			
 
				+	}
			
 
				+	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
			
 
				+}
			
 
				+
			
 
				+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
			
 
				+				    struct scrub_page *spage)
			
 
				+{
			
 
				+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
			
 
				+	struct scrub_bio *sbio;
			
 
				+	int ret;
			
 
				+
			
 
				+	mutex_lock(&wr_ctx->wr_lock);
			
 
				+again:
			
 
				+	if (!wr_ctx->wr_curr_bio) {
			
 
				+		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
			
 
				+					      GFP_NOFS);
			
 
				+		if (!wr_ctx->wr_curr_bio) {
			
 
				+			mutex_unlock(&wr_ctx->wr_lock);
			
 
				+			return -ENOMEM;
			
 
				+		}
			
 
				+		wr_ctx->wr_curr_bio->sctx = sctx;
			
 
				+		wr_ctx->wr_curr_bio->page_count = 0;
			
 
				+	}
			
 
				+	sbio = wr_ctx->wr_curr_bio;
			
 
				+	if (sbio->page_count == 0) {
			
 
				+		struct bio *bio;
			
 
				+
			
 
				+		sbio->physical = spage->physical_for_dev_replace;
			
 
				+		sbio->logical = spage->logical;
			
 
				+		sbio->dev = wr_ctx->tgtdev;
			
 
				+		bio = sbio->bio;
			
 
				+		if (!bio) {
			
 
				+			bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
			
 
				+			if (!bio) {
			
 
				+				mutex_unlock(&wr_ctx->wr_lock);
			
 
				+				return -ENOMEM;
			
 
				+			}
			
 
				+			sbio->bio = bio;
			
 
				+		}
			
 
				+
			
 
				+		bio->bi_private = sbio;
			
 
				+		bio->bi_end_io = scrub_wr_bio_end_io;
			
 
				+		bio->bi_bdev = sbio->dev->bdev;
			
 
				+		bio->bi_sector = sbio->physical >> 9;
			
 
				+		sbio->err = 0;
			
 
				+	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
			
 
				+		   spage->physical_for_dev_replace ||
			
 
				+		   sbio->logical + sbio->page_count * PAGE_SIZE !=
			
 
				+		   spage->logical) {
			
 
				+		scrub_wr_submit(sctx);
			
 
				+		goto again;
			
 
				+	}
			
 
				+
			
 
				+	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
			
 
				+	if (ret != PAGE_SIZE) {
			
 
				+		if (sbio->page_count < 1) {
			
 
				+			bio_put(sbio->bio);
			
 
				+			sbio->bio = NULL;
			
 
				+			mutex_unlock(&wr_ctx->wr_lock);
			
 
				+			return -EIO;
			
 
				+		}
			
 
				+		scrub_wr_submit(sctx);
			
 
				+		goto again;
			
 
				+	}
			
 
				+
			
 
				+	sbio->pagev[sbio->page_count] = spage;
			
 
				+	scrub_page_get(spage);
			
 
				+	sbio->page_count++;
			
 
				+	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
			
 
				+		scrub_wr_submit(sctx);
			
 
				+	mutex_unlock(&wr_ctx->wr_lock);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void scrub_wr_submit(struct scrub_ctx *sctx)
			
 
				+{
			
 
				+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
			
 
				+	struct scrub_bio *sbio;
			
 
				+
			
 
				+	if (!wr_ctx->wr_curr_bio)
			
 
				+		return;
			
 
				+
			
 
				+	sbio = wr_ctx->wr_curr_bio;
			
 
				+	wr_ctx->wr_curr_bio = NULL;
			
 
				+	WARN_ON(!sbio->bio->bi_bdev);
			
 
				+	scrub_pending_bio_inc(sctx);
			
 
				+	/* process all writes in a single worker thread. Then the block layer
			
 
				+	 * orders the requests before sending them to the driver which
			
 
				+	 * doubled the write performance on spinning disks when measured
			
 
				+	 * with Linux 3.5 */
			
 
				+	btrfsic_submit_bio(WRITE, sbio->bio);
			
 
				+}
			
 
				+
			
 
				+static void scrub_wr_bio_end_io(struct bio *bio, int err)
			
 
				+{
			
 
				+	struct scrub_bio *sbio = bio->bi_private;
			
 
				+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
			
 
				+
			
 
				+	sbio->err = err;
			
 
				+	sbio->bio = bio;
			
 
				+
			
 
				+	sbio->work.func = scrub_wr_bio_end_io_worker;
			
 
				+	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
			
 
				+}
			
 
				+
			
 
				+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
			
 
				+{
			
 
				+	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
			
 
				+	struct scrub_ctx *sctx = sbio->sctx;
			
 
				+	int i;
			
 
				+
			
 
				+	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
			
 
				+	if (sbio->err) {
			
 
				+		struct btrfs_dev_replace *dev_replace =
			
 
				+			&sbio->sctx->dev_root->fs_info->dev_replace;
			
 
				+
			
 
				+		for (i = 0; i < sbio->page_count; i++) {
			
 
				+			struct scrub_page *spage = sbio->pagev[i];
			
 
				+
			
 
				+			spage->io_error = 1;
			
 
				+			btrfs_dev_replace_stats_inc(&dev_replace->
			
 
				+						    num_write_errors);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < sbio->page_count; i++)
			
 
				+		scrub_page_put(sbio->pagev[i]);
			
 
				+
			
 
				+	bio_put(sbio->bio);
			
 
				+	kfree(sbio);
			
 
				+	scrub_pending_bio_dec(sctx);
			
 
				+}
			
 
				+
			
 
				+static int scrub_checksum(struct scrub_block *sblock)
			
 
				 {
			
 
				 	u64 flags;
			
 
				 	int ret;
			
 
				 
			
 
				-	BUG_ON(sblock->page_count < 1);
			
 
				-	flags = sblock->pagev[0].flags;
			
 
				+	WARN_ON(sblock->page_count < 1);
			
 
				+	flags = sblock->pagev[0]->flags;
			
 
				 	ret = 0;
			
 
				 	if (flags & BTRFS_EXTENT_FLAG_DATA)
			
 
				 		ret = scrub_checksum_data(sblock);
			
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
 
				 		WARN_ON(1);
			
 
				 	if (ret)
			
 
				 		scrub_handle_errored_block(sblock);
			
 
				+
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 static int scrub_checksum_data(struct scrub_block *sblock)
			
 
				 {
			
 
				-	struct scrub_dev *sdev = sblock->sdev;
			
 
				+	struct scrub_ctx *sctx = sblock->sctx;
			
 
				 	u8 csum[BTRFS_CSUM_SIZE];
			
 
				 	u8 *on_disk_csum;
			
 
				 	struct page *page;
			
 
				 	void *buffer;
			
 
				 	u32 crc = ~(u32)0;
			
 
				 	int fail = 0;
			
 
				-	struct btrfs_root *root = sdev->dev->dev_root;
			
 
				+	struct btrfs_root *root = sctx->dev_root;
			
 
				 	u64 len;
			
 
				 	int index;
			
 
				 
			
 
				 	BUG_ON(sblock->page_count < 1);
			
 
				-	if (!sblock->pagev[0].have_csum)
			
 
				+	if (!sblock->pagev[0]->have_csum)
			
 
				 		return 0;
			
 
				 
			
 
				-	on_disk_csum = sblock->pagev[0].csum;
			
 
				-	page = sblock->pagev[0].page;
			
 
				+	on_disk_csum = sblock->pagev[0]->csum;
			
 
				+	page = sblock->pagev[0]->page;
			
 
				 	buffer = kmap_atomic(page);
			
 
				 
			
 
				-	len = sdev->sectorsize;
			
 
				+	len = sctx->sectorsize;
			
 
				 	index = 0;
			
 
				 	for (;;) {
			
 
				 		u64 l = min_t(u64, len, PAGE_SIZE);
			
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 
				 			break;
			
 
				 		index++;
			
 
				 		BUG_ON(index >= sblock->page_count);
			
 
				-		BUG_ON(!sblock->pagev[index].page);
			
 
				-		page = sblock->pagev[index].page;
			
 
				+		BUG_ON(!sblock->pagev[index]->page);
			
 
				+		page = sblock->pagev[index]->page;
			
 
				 		buffer = kmap_atomic(page);
			
 
				 	}
			
 
				 
			
 
				 	btrfs_csum_final(crc, csum);
			
 
				-	if (memcmp(csum, on_disk_csum, sdev->csum_size))
			
 
				+	if (memcmp(csum, on_disk_csum, sctx->csum_size))
			
 
				 		fail = 1;
			
 
				 
			
 
				 	return fail;
			
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 
				 
			
 
				 static int scrub_checksum_tree_block(struct scrub_block *sblock)
			
 
				 {
			
 
				-	struct scrub_dev *sdev = sblock->sdev;
			
 
				+	struct scrub_ctx *sctx = sblock->sctx;
			
 
				 	struct btrfs_header *h;
			
 
				-	struct btrfs_root *root = sdev->dev->dev_root;
			
 
				+	struct btrfs_root *root = sctx->dev_root;
			
 
				 	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				 	u8 calculated_csum[BTRFS_CSUM_SIZE];
			
 
				 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
			
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 
				 	int index;
			
 
				 
			
 
				 	BUG_ON(sblock->page_count < 1);
			
 
				-	page = sblock->pagev[0].page;
			
 
				+	page = sblock->pagev[0]->page;
			
 
				 	mapped_buffer = kmap_atomic(page);
			
 
				 	h = (struct btrfs_header *)mapped_buffer;
			
 
				-	memcpy(on_disk_csum, h->csum, sdev->csum_size);
			
 
				+	memcpy(on_disk_csum, h->csum, sctx->csum_size);
			
 
				 
			
 
				 	/*
			
 
				 	 * we don't use the getter functions here, as we
			
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 
				 	 * b) the page is already kmapped
			
 
				 	 */
			
 
				 
			
 
				-	if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
			
 
				+	if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
			
 
				 		++fail;
			
 
				 
			
 
				-	if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
			
 
				+	if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
			
 
				 		++fail;
			
 
				 
			
 
				 	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
			
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 
				 		   BTRFS_UUID_SIZE))
			
 
				 		++fail;
			
 
				 
			
 
				-	BUG_ON(sdev->nodesize != sdev->leafsize);
			
 
				-	len = sdev->nodesize - BTRFS_CSUM_SIZE;
			
 
				+	WARN_ON(sctx->nodesize != sctx->leafsize);
			
 
				+	len = sctx->nodesize - BTRFS_CSUM_SIZE;
			
 
				 	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
			
 
				 	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
			
 
				 	index = 0;
			
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 
				 			break;
			
 
				 		index++;
			
 
				 		BUG_ON(index >= sblock->page_count);
			
 
				-		BUG_ON(!sblock->pagev[index].page);
			
 
				-		page = sblock->pagev[index].page;
			
 
				+		BUG_ON(!sblock->pagev[index]->page);
			
 
				+		page = sblock->pagev[index]->page;
			
 
				 		mapped_buffer = kmap_atomic(page);
			
 
				 		mapped_size = PAGE_SIZE;
			
 
				 		p = mapped_buffer;
			
 
				 	}
			
 
				 
			
 
				 	btrfs_csum_final(crc, calculated_csum);
			
 
				-	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
			
 
				+	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
			
 
				 		++crc_fail;
			
 
				 
			
 
				 	return fail || crc_fail;
			
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 
				 static int scrub_checksum_super(struct scrub_block *sblock)
			
 
				 {
			
 
				 	struct btrfs_super_block *s;
			
 
				-	struct scrub_dev *sdev = sblock->sdev;
			
 
				-	struct btrfs_root *root = sdev->dev->dev_root;
			
 
				+	struct scrub_ctx *sctx = sblock->sctx;
			
 
				+	struct btrfs_root *root = sctx->dev_root;
			
 
				 	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				 	u8 calculated_csum[BTRFS_CSUM_SIZE];
			
 
				 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
			
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 
				 	int index;
			
 
				 
			
 
				 	BUG_ON(sblock->page_count < 1);
			
 
				-	page = sblock->pagev[0].page;
			
 
				+	page = sblock->pagev[0]->page;
			
 
				 	mapped_buffer = kmap_atomic(page);
			
 
				 	s = (struct btrfs_super_block *)mapped_buffer;
			
 
				-	memcpy(on_disk_csum, s->csum, sdev->csum_size);
			
 
				+	memcpy(on_disk_csum, s->csum, sctx->csum_size);
			
 
				 
			
 
				-	if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
			
 
				+	if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
			
 
				 		++fail_cor;
			
 
				 
			
 
				-	if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
			
 
				+	if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
			
 
				 		++fail_gen;
			
 
				 
			
 
				 	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
			
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 
				 			break;
			
 
				 		index++;
			
 
				 		BUG_ON(index >= sblock->page_count);
			
 
				-		BUG_ON(!sblock->pagev[index].page);
			
 
				-		page = sblock->pagev[index].page;
			
 
				+		BUG_ON(!sblock->pagev[index]->page);
			
 
				+		page = sblock->pagev[index]->page;
			
 
				 		mapped_buffer = kmap_atomic(page);
			
 
				 		mapped_size = PAGE_SIZE;
			
 
				 		p = mapped_buffer;
			
 
				 	}
			
 
				 
			
 
				 	btrfs_csum_final(crc, calculated_csum);
			
 
				-	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
			
 
				+	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
			
 
				 		++fail_cor;
			
 
				 
			
 
				 	if (fail_cor + fail_gen) {
			
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 
				 		 * They will get written with the next transaction commit
			
 
				 		 * anyway
			
 
				 		 */
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		++sdev->stat.super_errors;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		++sctx->stat.super_errors;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 		if (fail_cor)
			
 
				-			btrfs_dev_stat_inc_and_print(sdev->dev,
			
 
				+			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
			
 
				 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
			
 
				 		else
			
 
				-			btrfs_dev_stat_inc_and_print(sdev->dev,
			
 
				+			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
			
 
				 				BTRFS_DEV_STAT_GENERATION_ERRS);
			
 
				 	}
			
 
				 
			
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
 
				 		int i;
			
 
				 
			
 
				 		for (i = 0; i < sblock->page_count; i++)
			
 
				-			if (sblock->pagev[i].page)
			
 
				-				__free_page(sblock->pagev[i].page);
			
 
				+			scrub_page_put(sblock->pagev[i]);
			
 
				 		kfree(sblock);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void scrub_submit(struct scrub_dev *sdev)
			
 
				+static void scrub_page_get(struct scrub_page *spage)
			
 
				+{
			
 
				+	atomic_inc(&spage->ref_count);
			
 
				+}
			
 
				+
			
 
				+static void scrub_page_put(struct scrub_page *spage)
			
 
				+{
			
 
				+	if (atomic_dec_and_test(&spage->ref_count)) {
			
 
				+		if (spage->page)
			
 
				+			__free_page(spage->page);
			
 
				+		kfree(spage);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void scrub_submit(struct scrub_ctx *sctx)
			
 
				 {
			
 
				 	struct scrub_bio *sbio;
			
 
				 
			
 
				-	if (sdev->curr == -1)
			
 
				+	if (sctx->curr == -1)
			
 
				 		return;
			
 
				 
			
 
				-	sbio = sdev->bios[sdev->curr];
			
 
				-	sdev->curr = -1;
			
 
				-	atomic_inc(&sdev->in_flight);
			
 
				+	sbio = sctx->bios[sctx->curr];
			
 
				+	sctx->curr = -1;
			
 
				+	scrub_pending_bio_inc(sctx);
			
 
				 
			
 
				-	btrfsic_submit_bio(READ, sbio->bio);
			
 
				+	if (!sbio->bio->bi_bdev) {
			
 
				+		/*
			
 
				+		 * this case should not happen. If btrfs_map_block() is
			
 
				+		 * wrong, it could happen for dev-replace operations on
			
 
				+		 * missing devices when no mirrors are available, but in
			
 
				+		 * this case it should already fail the mount.
			
 
				+		 * This case is handled correctly (but _very_ slowly).
			
 
				+		 */
			
 
				+		printk_ratelimited(KERN_WARNING
			
 
				+			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
			
 
				+		bio_endio(sbio->bio, -EIO);
			
 
				+	} else {
			
 
				+		btrfsic_submit_bio(READ, sbio->bio);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
			
 
				-				 struct scrub_page *spage)
			
 
				+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
			
 
				+				    struct scrub_page *spage)
			
 
				 {
			
 
				 	struct scrub_block *sblock = spage->sblock;
			
 
				 	struct scrub_bio *sbio;
			
@@ -1494,28 +1901,29 @@ static int scrub_add_page_to_bio(struct scrub_dev *sdev,
 
				 	/*
			
 
				 	 * grab a fresh bio or wait for one to become available
			
 
				 	 */
			
 
				-	while (sdev->curr == -1) {
			
 
				-		spin_lock(&sdev->list_lock);
			
 
				-		sdev->curr = sdev->first_free;
			
 
				-		if (sdev->curr != -1) {
			
 
				-			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			
 
				-			sdev->bios[sdev->curr]->next_free = -1;
			
 
				-			sdev->bios[sdev->curr]->page_count = 0;
			
 
				-			spin_unlock(&sdev->list_lock);
			
 
				+	while (sctx->curr == -1) {
			
 
				+		spin_lock(&sctx->list_lock);
			
 
				+		sctx->curr = sctx->first_free;
			
 
				+		if (sctx->curr != -1) {
			
 
				+			sctx->first_free = sctx->bios[sctx->curr]->next_free;
			
 
				+			sctx->bios[sctx->curr]->next_free = -1;
			
 
				+			sctx->bios[sctx->curr]->page_count = 0;
			
 
				+			spin_unlock(&sctx->list_lock);
			
 
				 		} else {
			
 
				-			spin_unlock(&sdev->list_lock);
			
 
				-			wait_event(sdev->list_wait, sdev->first_free != -1);
			
 
				+			spin_unlock(&sctx->list_lock);
			
 
				+			wait_event(sctx->list_wait, sctx->first_free != -1);
			
 
				 		}
			
 
				 	}
			
 
				-	sbio = sdev->bios[sdev->curr];
			
 
				+	sbio = sctx->bios[sctx->curr];
			
 
				 	if (sbio->page_count == 0) {
			
 
				 		struct bio *bio;
			
 
				 
			
 
				 		sbio->physical = spage->physical;
			
 
				 		sbio->logical = spage->logical;
			
 
				+		sbio->dev = spage->dev;
			
 
				 		bio = sbio->bio;
			
 
				 		if (!bio) {
			
 
				-			bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
			
 
				+			bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
			
 
				 			if (!bio)
			
 
				 				return -ENOMEM;
			
 
				 			sbio->bio = bio;
			
@@ -1523,14 +1931,15 @@ static int scrub_add_page_to_bio(struct scrub_dev *sdev,
 
				 
			
 
				 		bio->bi_private = sbio;
			
 
				 		bio->bi_end_io = scrub_bio_end_io;
			
 
				-		bio->bi_bdev = sdev->dev->bdev;
			
 
				-		bio->bi_sector = spage->physical >> 9;
			
 
				+		bio->bi_bdev = sbio->dev->bdev;
			
 
				+		bio->bi_sector = sbio->physical >> 9;
			
 
				 		sbio->err = 0;
			
 
				 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
			
 
				 		   spage->physical ||
			
 
				 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
			
 
				-		   spage->logical) {
			
 
				-		scrub_submit(sdev);
			
 
				+		   spage->logical ||
			
 
				+		   sbio->dev != spage->dev) {
			
 
				+		scrub_submit(sctx);
			
 
				 		goto again;
			
 
				 	}
			
 
				 
			
@@ -1542,81 +1951,87 @@ static int scrub_add_page_to_bio(struct scrub_dev *sdev,
 
				 			sbio->bio = NULL;
			
 
				 			return -EIO;
			
 
				 		}
			
 
				-		scrub_submit(sdev);
			
 
				+		scrub_submit(sctx);
			
 
				 		goto again;
			
 
				 	}
			
 
				 
			
 
				-	scrub_block_get(sblock); /* one for the added page */
			
 
				+	scrub_block_get(sblock); /* one for the page added to the bio */
			
 
				 	atomic_inc(&sblock->outstanding_pages);
			
 
				 	sbio->page_count++;
			
 
				-	if (sbio->page_count == sdev->pages_per_bio)
			
 
				-		scrub_submit(sdev);
			
 
				+	if (sbio->page_count == sctx->pages_per_rd_bio)
			
 
				+		scrub_submit(sctx);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
			
 
				-		       u64 physical, u64 flags, u64 gen, int mirror_num,
			
 
				-		       u8 *csum, int force)
			
 
				+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			
 
				+		       u64 physical, struct btrfs_device *dev, u64 flags,
			
 
				+		       u64 gen, int mirror_num, u8 *csum, int force,
			
 
				+		       u64 physical_for_dev_replace)
			
 
				 {
			
 
				 	struct scrub_block *sblock;
			
 
				 	int index;
			
 
				 
			
 
				 	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
			
 
				 	if (!sblock) {
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.malloc_errors++;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.malloc_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 		return -ENOMEM;
			
 
				 	}
			
 
				 
			
 
				-	/* one ref inside this function, plus one for each page later on */
			
 
				+	/* one ref inside this function, plus one for each page added to
			
 
				+	 * a bio later on */
			
 
				 	atomic_set(&sblock->ref_count, 1);
			
 
				-	sblock->sdev = sdev;
			
 
				+	sblock->sctx = sctx;
			
 
				 	sblock->no_io_error_seen = 1;
			
 
				 
			
 
				 	for (index = 0; len > 0; index++) {
			
 
				-		struct scrub_page *spage = sblock->pagev + index;
			
 
				+		struct scrub_page *spage;
			
 
				 		u64 l = min_t(u64, len, PAGE_SIZE);
			
 
				 
			
 
				-		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
			
 
				-		spage->page = alloc_page(GFP_NOFS);
			
 
				-		if (!spage->page) {
			
 
				-			spin_lock(&sdev->stat_lock);
			
 
				-			sdev->stat.malloc_errors++;
			
 
				-			spin_unlock(&sdev->stat_lock);
			
 
				-			while (index > 0) {
			
 
				-				index--;
			
 
				-				__free_page(sblock->pagev[index].page);
			
 
				-			}
			
 
				-			kfree(sblock);
			
 
				+		spage = kzalloc(sizeof(*spage), GFP_NOFS);
			
 
				+		if (!spage) {
			
 
				+leave_nomem:
			
 
				+			spin_lock(&sctx->stat_lock);
			
 
				+			sctx->stat.malloc_errors++;
			
 
				+			spin_unlock(&sctx->stat_lock);
			
 
				+			scrub_block_put(sblock);
			
 
				 			return -ENOMEM;
			
 
				 		}
			
 
				+		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
			
 
				+		scrub_page_get(spage);
			
 
				+		sblock->pagev[index] = spage;
			
 
				 		spage->sblock = sblock;
			
 
				-		spage->dev = sdev->dev;
			
 
				+		spage->dev = dev;
			
 
				 		spage->flags = flags;
			
 
				 		spage->generation = gen;
			
 
				 		spage->logical = logical;
			
 
				 		spage->physical = physical;
			
 
				+		spage->physical_for_dev_replace = physical_for_dev_replace;
			
 
				 		spage->mirror_num = mirror_num;
			
 
				 		if (csum) {
			
 
				 			spage->have_csum = 1;
			
 
				-			memcpy(spage->csum, csum, sdev->csum_size);
			
 
				+			memcpy(spage->csum, csum, sctx->csum_size);
			
 
				 		} else {
			
 
				 			spage->have_csum = 0;
			
 
				 		}
			
 
				 		sblock->page_count++;
			
 
				+		spage->page = alloc_page(GFP_NOFS);
			
 
				+		if (!spage->page)
			
 
				+			goto leave_nomem;
			
 
				 		len -= l;
			
 
				 		logical += l;
			
 
				 		physical += l;
			
 
				+		physical_for_dev_replace += l;
			
 
				 	}
			
 
				 
			
 
				-	BUG_ON(sblock->page_count == 0);
			
 
				+	WARN_ON(sblock->page_count == 0);
			
 
				 	for (index = 0; index < sblock->page_count; index++) {
			
 
				-		struct scrub_page *spage = sblock->pagev + index;
			
 
				+		struct scrub_page *spage = sblock->pagev[index];
			
 
				 		int ret;
			
 
				 
			
 
				-		ret = scrub_add_page_to_bio(sdev, spage);
			
 
				+		ret = scrub_add_page_to_rd_bio(sctx, spage);
			
 
				 		if (ret) {
			
 
				 			scrub_block_put(sblock);
			
 
				 			return ret;
			
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 
				 	}
			
 
				 
			
 
				 	if (force)
			
 
				-		scrub_submit(sdev);
			
 
				+		scrub_submit(sctx);
			
 
				 
			
 
				 	/* last one frees, either here or in bio completion for last page */
			
 
				 	scrub_block_put(sblock);
			
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 
				 static void scrub_bio_end_io(struct bio *bio, int err)
			
 
				 {
			
 
				 	struct scrub_bio *sbio = bio->bi_private;
			
 
				-	struct scrub_dev *sdev = sbio->sdev;
			
 
				-	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
			
 
				+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
			
 
				 
			
 
				 	sbio->err = err;
			
 
				 	sbio->bio = bio;
			
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
 
				 static void scrub_bio_end_io_worker(struct btrfs_work *work)
			
 
				 {
			
 
				 	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
			
 
				-	struct scrub_dev *sdev = sbio->sdev;
			
 
				+	struct scrub_ctx *sctx = sbio->sctx;
			
 
				 	int i;
			
 
				 
			
 
				-	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
			
 
				+	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
			
 
				 	if (sbio->err) {
			
 
				 		for (i = 0; i < sbio->page_count; i++) {
			
 
				 			struct scrub_page *spage = sbio->pagev[i];
			
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 
				 
			
 
				 	bio_put(sbio->bio);
			
 
				 	sbio->bio = NULL;
			
 
				-	spin_lock(&sdev->list_lock);
			
 
				-	sbio->next_free = sdev->first_free;
			
 
				-	sdev->first_free = sbio->index;
			
 
				-	spin_unlock(&sdev->list_lock);
			
 
				-	atomic_dec(&sdev->in_flight);
			
 
				-	wake_up(&sdev->list_wait);
			
 
				+	spin_lock(&sctx->list_lock);
			
 
				+	sbio->next_free = sctx->first_free;
			
 
				+	sctx->first_free = sbio->index;
			
 
				+	spin_unlock(&sctx->list_lock);
			
 
				+
			
 
				+	if (sctx->is_dev_replace &&
			
 
				+	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
			
 
				+		mutex_lock(&sctx->wr_ctx.wr_lock);
			
 
				+		scrub_wr_submit(sctx);
			
 
				+		mutex_unlock(&sctx->wr_ctx.wr_lock);
			
 
				+	}
			
 
				+
			
 
				+	scrub_pending_bio_dec(sctx);
			
 
				 }
			
 
				 
			
 
				 static void scrub_block_complete(struct scrub_block *sblock)
			
 
				 {
			
 
				-	if (!sblock->no_io_error_seen)
			
 
				+	if (!sblock->no_io_error_seen) {
			
 
				 		scrub_handle_errored_block(sblock);
			
 
				-	else
			
 
				-		scrub_checksum(sblock);
			
 
				+	} else {
			
 
				+		/*
			
 
				+		 * if has checksum error, write via repair mechanism in
			
 
				+		 * dev replace case, otherwise write here in dev replace
			
 
				+		 * case.
			
 
				+		 */
			
 
				+		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
			
 
				+			scrub_write_block_to_dev_replace(sblock);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			
 
				+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
			
 
				 			   u8 *csum)
			
 
				 {
			
 
				 	struct btrfs_ordered_sum *sum = NULL;
			
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 
				 	unsigned long i;
			
 
				 	unsigned long num_sectors;
			
 
				 
			
 
				-	while (!list_empty(&sdev->csum_list)) {
			
 
				-		sum = list_first_entry(&sdev->csum_list,
			
 
				+	while (!list_empty(&sctx->csum_list)) {
			
 
				+		sum = list_first_entry(&sctx->csum_list,
			
 
				 				       struct btrfs_ordered_sum, list);
			
 
				 		if (sum->bytenr > logical)
			
 
				 			return 0;
			
 
				 		if (sum->bytenr + sum->len > logical)
			
 
				 			break;
			
 
				 
			
 
				-		++sdev->stat.csum_discards;
			
 
				+		++sctx->stat.csum_discards;
			
 
				 		list_del(&sum->list);
			
 
				 		kfree(sum);
			
 
				 		sum = NULL;
			
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 
				 	if (!sum)
			
 
				 		return 0;
			
 
				 
			
 
				-	num_sectors = sum->len / sdev->sectorsize;
			
 
				+	num_sectors = sum->len / sctx->sectorsize;
			
 
				 	for (i = 0; i < num_sectors; ++i) {
			
 
				 		if (sum->sums[i].bytenr == logical) {
			
 
				-			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
			
 
				+			memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
			
 
				 			ret = 1;
			
 
				 			break;
			
 
				 		}
			
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 
				 }
			
 
				 
			
 
				 /* scrub extent tries to collect up to 64 kB for each bio */
			
 
				-static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			
 
				-			u64 physical, u64 flags, u64 gen, int mirror_num)
			
 
				+static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
			
 
				+			u64 physical, struct btrfs_device *dev, u64 flags,
			
 
				+			u64 gen, int mirror_num, u64 physical_for_dev_replace)
			
 
				 {
			
 
				 	int ret;
			
 
				 	u8 csum[BTRFS_CSUM_SIZE];
			
 
				 	u32 blocksize;
			
 
				 
			
 
				 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
			
 
				-		blocksize = sdev->sectorsize;
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.data_extents_scrubbed++;
			
 
				-		sdev->stat.data_bytes_scrubbed += len;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		blocksize = sctx->sectorsize;
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.data_extents_scrubbed++;
			
 
				+		sctx->stat.data_bytes_scrubbed += len;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			
 
				-		BUG_ON(sdev->nodesize != sdev->leafsize);
			
 
				-		blocksize = sdev->nodesize;
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.tree_extents_scrubbed++;
			
 
				-		sdev->stat.tree_bytes_scrubbed += len;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		WARN_ON(sctx->nodesize != sctx->leafsize);
			
 
				+		blocksize = sctx->nodesize;
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.tree_extents_scrubbed++;
			
 
				+		sctx->stat.tree_bytes_scrubbed += len;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 	} else {
			
 
				-		blocksize = sdev->sectorsize;
			
 
				-		BUG_ON(1);
			
 
				+		blocksize = sctx->sectorsize;
			
 
				+		WARN_ON(1);
			
 
				 	}
			
 
				 
			
 
				 	while (len) {
			
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
 
				 
			
 
				 		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			
 
				 			/* push csums to sbio */
			
 
				-			have_csum = scrub_find_csum(sdev, logical, l, csum);
			
 
				+			have_csum = scrub_find_csum(sctx, logical, l, csum);
			
 
				 			if (have_csum == 0)
			
 
				-				++sdev->stat.no_csum;
			
 
				+				++sctx->stat.no_csum;
			
 
				+			if (sctx->is_dev_replace && !have_csum) {
			
 
				+				ret = copy_nocow_pages(sctx, logical, l,
			
 
				+						       mirror_num,
			
 
				+						      physical_for_dev_replace);
			
 
				+				goto behind_scrub_pages;
			
 
				+			}
			
 
				 		}
			
 
				-		ret = scrub_pages(sdev, logical, l, physical, flags, gen,
			
 
				-				  mirror_num, have_csum ? csum : NULL, 0);
			
 
				+		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
			
 
				+				  mirror_num, have_csum ? csum : NULL, 0,
			
 
				+				  physical_for_dev_replace);
			
 
				+behind_scrub_pages:
			
 
				 		if (ret)
			
 
				 			return ret;
			
 
				 		len -= l;
			
 
				 		logical += l;
			
 
				 		physical += l;
			
 
				+		physical_for_dev_replace += l;
			
 
				 	}
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
			
 
				-	struct map_lookup *map, int num, u64 base, u64 length)
			
 
				+static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
			
 
				+					   struct map_lookup *map,
			
 
				+					   struct btrfs_device *scrub_dev,
			
 
				+					   int num, u64 base, u64 length,
			
 
				+					   int is_dev_replace)
			
 
				 {
			
 
				 	struct btrfs_path *path;
			
 
				-	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
			
 
				+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
			
 
				 	struct btrfs_root *root = fs_info->extent_root;
			
 
				 	struct btrfs_root *csum_root = fs_info->csum_root;
			
 
				 	struct btrfs_extent_item *extent;
			
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 
				 	struct reada_control *reada2;
			
 
				 	struct btrfs_key key_start;
			
 
				 	struct btrfs_key key_end;
			
 
				-
			
 
				 	u64 increment = map->stripe_len;
			
 
				 	u64 offset;
			
 
				+	u64 extent_logical;
			
 
				+	u64 extent_physical;
			
 
				+	u64 extent_len;
			
 
				+	struct btrfs_device *extent_dev;
			
 
				+	int extent_mirror_num;
			
 
				 
			
 
				 	nstripes = length;
			
 
				 	offset = 0;
			
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 
				 	 */
			
 
				 	logical = base + offset;
			
 
				 
			
 
				-	wait_event(sdev->list_wait,
			
 
				-		   atomic_read(&sdev->in_flight) == 0);
			
 
				+	wait_event(sctx->list_wait,
			
 
				+		   atomic_read(&sctx->bios_in_flight) == 0);
			
 
				 	atomic_inc(&fs_info->scrubs_paused);
			
 
				 	wake_up(&fs_info->scrub_pause_wait);
			
 
				 
			
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 
				 		 * canceled?
			
 
				 		 */
			
 
				 		if (atomic_read(&fs_info->scrub_cancel_req) ||
			
 
				-		    atomic_read(&sdev->cancel_req)) {
			
 
				+		    atomic_read(&sctx->cancel_req)) {
			
 
				 			ret = -ECANCELED;
			
 
				 			goto out;
			
 
				 		}
			
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 
				 		 */
			
 
				 		if (atomic_read(&fs_info->scrub_pause_req)) {
			
 
				 			/* push queued extents */
			
 
				-			scrub_submit(sdev);
			
 
				-			wait_event(sdev->list_wait,
			
 
				-				   atomic_read(&sdev->in_flight) == 0);
			
 
				+			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
			
 
				+			scrub_submit(sctx);
			
 
				+			mutex_lock(&sctx->wr_ctx.wr_lock);
			
 
				+			scrub_wr_submit(sctx);
			
 
				+			mutex_unlock(&sctx->wr_ctx.wr_lock);
			
 
				+			wait_event(sctx->list_wait,
			
 
				+				   atomic_read(&sctx->bios_in_flight) == 0);
			
 
				+			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
			
 
				 			atomic_inc(&fs_info->scrubs_paused);
			
 
				 			wake_up(&fs_info->scrub_pause_wait);
			
 
				 			mutex_lock(&fs_info->scrub_lock);
			
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 
				 
			
 
				 		ret = btrfs_lookup_csums_range(csum_root, logical,
			
 
				 					       logical + map->stripe_len - 1,
			
 
				-					       &sdev->csum_list, 1);
			
 
				+					       &sctx->csum_list, 1);
			
 
				 		if (ret)
			
 
				 			goto out;
			
 
				 
			
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 
				 					     key.objectid;
			
 
				 			}
			
 
				 
			
 
				-			ret = scrub_extent(sdev, key.objectid, key.offset,
			
 
				-					   key.objectid - logical + physical,
			
 
				-					   flags, generation, mirror_num);
			
 
				+			extent_logical = key.objectid;
			
 
				+			extent_physical = key.objectid - logical + physical;
			
 
				+			extent_len = key.offset;
			
 
				+			extent_dev = scrub_dev;
			
 
				+			extent_mirror_num = mirror_num;
			
 
				+			if (is_dev_replace)
			
 
				+				scrub_remap_extent(fs_info, extent_logical,
			
 
				+						   extent_len, &extent_physical,
			
 
				+						   &extent_dev,
			
 
				+						   &extent_mirror_num);
			
 
				+			ret = scrub_extent(sctx, extent_logical, extent_len,
			
 
				+					   extent_physical, extent_dev, flags,
			
 
				+					   generation, extent_mirror_num,
			
 
				+					   key.objectid - logical + physical);
			
 
				 			if (ret)
			
 
				 				goto out;
			
 
				 
			
@@ -2016,29 +2477,34 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 
				 		btrfs_release_path(path);
			
 
				 		logical += increment;
			
 
				 		physical += map->stripe_len;
			
 
				-		spin_lock(&sdev->stat_lock);
			
 
				-		sdev->stat.last_physical = physical;
			
 
				-		spin_unlock(&sdev->stat_lock);
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.last_physical = physical;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				 	}
			
 
				+out:
			
 
				 	/* push queued extents */
			
 
				-	scrub_submit(sdev);
			
 
				+	scrub_submit(sctx);
			
 
				+	mutex_lock(&sctx->wr_ctx.wr_lock);
			
 
				+	scrub_wr_submit(sctx);
			
 
				+	mutex_unlock(&sctx->wr_ctx.wr_lock);
			
 
				 
			
 
				-out:
			
 
				 	blk_finish_plug(&plug);
			
 
				 	btrfs_free_path(path);
			
 
				 	return ret < 0 ? ret : 0;
			
 
				 }
			
 
				 
			
 
				-static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
			
 
				-	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length,
			
 
				-	u64 dev_offset)
			
 
				+static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
			
 
				+					  struct btrfs_device *scrub_dev,
			
 
				+					  u64 chunk_tree, u64 chunk_objectid,
			
 
				+					  u64 chunk_offset, u64 length,
			
 
				+					  u64 dev_offset, int is_dev_replace)
			
 
				 {
			
 
				 	struct btrfs_mapping_tree *map_tree =
			
 
				-		&sdev->dev->dev_root->fs_info->mapping_tree;
			
 
				+		&sctx->dev_root->fs_info->mapping_tree;
			
 
				 	struct map_lookup *map;
			
 
				 	struct extent_map *em;
			
 
				 	int i;
			
 
				-	int ret = -EINVAL;
			
 
				+	int ret = 0;
			
 
				 
			
 
				 	read_lock(&map_tree->map_tree.lock);
			
 
				 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
			
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
 
				 		goto out;
			
 
				 
			
 
				 	for (i = 0; i < map->num_stripes; ++i) {
			
 
				-		if (map->stripes[i].dev == sdev->dev &&
			
 
				+		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
			
 
				 		    map->stripes[i].physical == dev_offset) {
			
 
				-			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			
 
				+			ret = scrub_stripe(sctx, map, scrub_dev, i,
			
 
				+					   chunk_offset, length,
			
 
				+					   is_dev_replace);
			
 
				 			if (ret)
			
 
				 				goto out;
			
 
				 		}
			
@@ -2069,11 +2537,13 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
 
				 }
			
 
				 
			
 
				 static noinline_for_stack
			
 
				-int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
			
 
				+int scrub_enumerate_chunks(struct scrub_ctx *sctx,
			
 
				+			   struct btrfs_device *scrub_dev, u64 start, u64 end,
			
 
				+			   int is_dev_replace)
			
 
				 {
			
 
				 	struct btrfs_dev_extent *dev_extent = NULL;
			
 
				 	struct btrfs_path *path;
			
 
				-	struct btrfs_root *root = sdev->dev->dev_root;
			
 
				+	struct btrfs_root *root = sctx->dev_root;
			
 
				 	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				 	u64 length;
			
 
				 	u64 chunk_tree;
			
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
 
				 	struct btrfs_key key;
			
 
				 	struct btrfs_key found_key;
			
 
				 	struct btrfs_block_group_cache *cache;
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				 
			
 
				 	path = btrfs_alloc_path();
			
 
				 	if (!path)
			
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
 
				 	path->search_commit_root = 1;
			
 
				 	path->skip_locking = 1;
			
 
				 
			
 
				-	key.objectid = sdev->dev->devid;
			
 
				+	key.objectid = scrub_dev->devid;
			
 
				 	key.offset = 0ull;
			
 
				 	key.type = BTRFS_DEV_EXTENT_KEY;
			
 
				 
			
 
				-
			
 
				 	while (1) {
			
 
				 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
			
 
				 		if (ret < 0)
			
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
 
				 
			
 
				 		btrfs_item_key_to_cpu(l, &found_key, slot);
			
 
				 
			
 
				-		if (found_key.objectid != sdev->dev->devid)
			
 
				+		if (found_key.objectid != scrub_dev->devid)
			
 
				 			break;
			
 
				 
			
 
				 		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
			
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
 
				 			ret = -ENOENT;
			
 
				 			break;
			
 
				 		}
			
 
				-		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
			
 
				-				  chunk_offset, length, found_key.offset);
			
 
				+		dev_replace->cursor_right = found_key.offset + length;
			
 
				+		dev_replace->cursor_left = found_key.offset;
			
 
				+		dev_replace->item_needs_writeback = 1;
			
 
				+		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
			
 
				+				  chunk_offset, length, found_key.offset,
			
 
				+				  is_dev_replace);
			
 
				+
			
 
				+		/*
			
 
				+		 * flush, submit all pending read and write bios, afterwards
			
 
				+		 * wait for them.
			
 
				+		 * Note that in the dev replace case, a read request causes
			
 
				+		 * write requests that are submitted in the read completion
			
 
				+		 * worker. Therefore in the current situation, it is required
			
 
				+		 * that all write requests are flushed, so that all read and
			
 
				+		 * write requests are really completed when bios_in_flight
			
 
				+		 * changes to 0.
			
 
				+		 */
			
 
				+		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
			
 
				+		scrub_submit(sctx);
			
 
				+		mutex_lock(&sctx->wr_ctx.wr_lock);
			
 
				+		scrub_wr_submit(sctx);
			
 
				+		mutex_unlock(&sctx->wr_ctx.wr_lock);
			
 
				+
			
 
				+		wait_event(sctx->list_wait,
			
 
				+			   atomic_read(&sctx->bios_in_flight) == 0);
			
 
				+		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
			
 
				+		atomic_inc(&fs_info->scrubs_paused);
			
 
				+		wake_up(&fs_info->scrub_pause_wait);
			
 
				+		wait_event(sctx->list_wait,
			
 
				+			   atomic_read(&sctx->workers_pending) == 0);
			
 
				+
			
 
				+		mutex_lock(&fs_info->scrub_lock);
			
 
				+		while (atomic_read(&fs_info->scrub_pause_req)) {
			
 
				+			mutex_unlock(&fs_info->scrub_lock);
			
 
				+			wait_event(fs_info->scrub_pause_wait,
			
 
				+			   atomic_read(&fs_info->scrub_pause_req) == 0);
			
 
				+			mutex_lock(&fs_info->scrub_lock);
			
 
				+		}
			
 
				+		atomic_dec(&fs_info->scrubs_paused);
			
 
				+		mutex_unlock(&fs_info->scrub_lock);
			
 
				+		wake_up(&fs_info->scrub_pause_wait);
			
 
				+
			
 
				+		dev_replace->cursor_left = dev_replace->cursor_right;
			
 
				+		dev_replace->item_needs_writeback = 1;
			
 
				 		btrfs_put_block_group(cache);
			
 
				 		if (ret)
			
 
				 			break;
			
 
				+		if (is_dev_replace &&
			
 
				+		    atomic64_read(&dev_replace->num_write_errors) > 0) {
			
 
				+			ret = -EIO;
			
 
				+			break;
			
 
				+		}
			
 
				+		if (sctx->stat.malloc_errors > 0) {
			
 
				+			ret = -ENOMEM;
			
 
				+			break;
			
 
				+		}
			
 
				 
			
 
				 		key.offset = found_key.offset + length;
			
 
				 		btrfs_release_path(path);
			
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
 
				 	return ret < 0 ? ret : 0;
			
 
				 }
			
 
				 
			
 
				-static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
			
 
				+static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
			
 
				+					   struct btrfs_device *scrub_dev)
			
 
				 {
			
 
				 	int	i;
			
 
				 	u64	bytenr;
			
 
				 	u64	gen;
			
 
				 	int	ret;
			
 
				-	struct btrfs_device *device = sdev->dev;
			
 
				-	struct btrfs_root *root = device->dev_root;
			
 
				+	struct btrfs_root *root = sctx->dev_root;
			
 
				 
			
 
				 	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
			
 
				 		return -EIO;
			
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
 
				 
			
 
				 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			
 
				 		bytenr = btrfs_sb_offset(i);
			
 
				-		if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
			
 
				+		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
			
 
				 			break;
			
 
				 
			
 
				-		ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
			
 
				-				     BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
			
 
				+		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
			
 
				+				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
			
 
				+				  NULL, 1, bytenr);
			
 
				 		if (ret)
			
 
				 			return ret;
			
 
				 	}
			
 
				-	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
			
 
				+	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
 
				 /*
			
 
				  * get a reference count on fs_info->scrub_workers. start worker if necessary
			
 
				  */
			
 
				-static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
			
 
				+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
			
 
				+						int is_dev_replace)
			
 
				 {
			
 
				-	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				 	int ret = 0;
			
 
				 
			
 
				 	mutex_lock(&fs_info->scrub_lock);
			
 
				 	if (fs_info->scrub_workers_refcnt == 0) {
			
 
				-		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
			
 
				-			   fs_info->thread_pool_size, &fs_info->generic_worker);
			
 
				+		if (is_dev_replace)
			
 
				+			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
			
 
				+					&fs_info->generic_worker);
			
 
				+		else
			
 
				+			btrfs_init_workers(&fs_info->scrub_workers, "scrub",
			
 
				+					fs_info->thread_pool_size,
			
 
				+					&fs_info->generic_worker);
			
 
				 		fs_info->scrub_workers.idle_thresh = 4;
			
 
				 		ret = btrfs_start_workers(&fs_info->scrub_workers);
			
 
				 		if (ret)
			
 
				 			goto out;
			
 
				+		btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
			
 
				+				   "scrubwrc",
			
 
				+				   fs_info->thread_pool_size,
			
 
				+				   &fs_info->generic_worker);
			
 
				+		fs_info->scrub_wr_completion_workers.idle_thresh = 2;
			
 
				+		ret = btrfs_start_workers(
			
 
				+				&fs_info->scrub_wr_completion_workers);
			
 
				+		if (ret)
			
 
				+			goto out;
			
 
				+		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
			
 
				+				   &fs_info->generic_worker);
			
 
				+		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
			
 
				+		if (ret)
			
 
				+			goto out;
			
 
				 	}
			
 
				 	++fs_info->scrub_workers_refcnt;
			
 
				 out:
			
@@ -2223,40 +2764,41 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
			
 
				+static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
			
 
				 {
			
 
				-	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				-
			
 
				 	mutex_lock(&fs_info->scrub_lock);
			
 
				-	if (--fs_info->scrub_workers_refcnt == 0)
			
 
				+	if (--fs_info->scrub_workers_refcnt == 0) {
			
 
				 		btrfs_stop_workers(&fs_info->scrub_workers);
			
 
				+		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
			
 
				+		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
			
 
				+	}
			
 
				 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
			
 
				 	mutex_unlock(&fs_info->scrub_lock);
			
 
				 }
			
 
				 
			
 
				-
			
 
				-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
			
 
				-		    struct btrfs_scrub_progress *progress, int readonly)
			
 
				+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
			
 
				+		    u64 end, struct btrfs_scrub_progress *progress,
			
 
				+		    int readonly, int is_dev_replace)
			
 
				 {
			
 
				-	struct scrub_dev *sdev;
			
 
				-	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				+	struct scrub_ctx *sctx;
			
 
				 	int ret;
			
 
				 	struct btrfs_device *dev;
			
 
				 
			
 
				-	if (btrfs_fs_closing(root->fs_info))
			
 
				+	if (btrfs_fs_closing(fs_info))
			
 
				 		return -EINVAL;
			
 
				 
			
 
				 	/*
			
 
				 	 * check some assumptions
			
 
				 	 */
			
 
				-	if (root->nodesize != root->leafsize) {
			
 
				+	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
			
 
				 		printk(KERN_ERR
			
 
				 		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
			
 
				-		       root->nodesize, root->leafsize);
			
 
				+		       fs_info->chunk_root->nodesize,
			
 
				+		       fs_info->chunk_root->leafsize);
			
 
				 		return -EINVAL;
			
 
				 	}
			
 
				 
			
 
				-	if (root->nodesize > BTRFS_STRIPE_LEN) {
			
 
				+	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
			
 
				 		/*
			
 
				 		 * in this case scrub is unable to calculate the checksum
			
 
				 		 * the way scrub is implemented. Do not handle this
			
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
 
				 		 */
			
 
				 		printk(KERN_ERR
			
 
				 		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
			
 
				-		       root->nodesize, BTRFS_STRIPE_LEN);
			
 
				+		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
			
 
				 		return -EINVAL;
			
 
				 	}
			
 
				 
			
 
				-	if (root->sectorsize != PAGE_SIZE) {
			
 
				+	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
			
 
				 		/* not supported for data w/o checksums */
			
 
				 		printk(KERN_ERR
			
 
				 		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
			
 
				-		       root->sectorsize, (unsigned long long)PAGE_SIZE);
			
 
				+		       fs_info->chunk_root->sectorsize,
			
 
				+		       (unsigned long long)PAGE_SIZE);
			
 
				 		return -EINVAL;
			
 
				 	}
			
 
				 
			
 
				-	ret = scrub_workers_get(root);
			
 
				+	if (fs_info->chunk_root->nodesize >
			
 
				+	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
			
 
				+	    fs_info->chunk_root->sectorsize >
			
 
				+	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
			
 
				+		/*
			
 
				+		 * would exhaust the array bounds of pagev member in
			
 
				+		 * struct scrub_block
			
 
				+		 */
			
 
				+		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
			
 
				+		       fs_info->chunk_root->nodesize,
			
 
				+		       SCRUB_MAX_PAGES_PER_BLOCK,
			
 
				+		       fs_info->chunk_root->sectorsize,
			
 
				+		       SCRUB_MAX_PAGES_PER_BLOCK);
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	ret = scrub_workers_get(fs_info, is_dev_replace);
			
 
				 	if (ret)
			
 
				 		return ret;
			
 
				 
			
 
				-	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				-	dev = btrfs_find_device(root, devid, NULL, NULL);
			
 
				-	if (!dev || dev->missing) {
			
 
				-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				-		scrub_workers_put(root);
			
 
				+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
			
 
				+	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
			
 
				+	if (!dev || (dev->missing && !is_dev_replace)) {
			
 
				+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			
 
				+		scrub_workers_put(fs_info);
			
 
				 		return -ENODEV;
			
 
				 	}
			
 
				 	mutex_lock(&fs_info->scrub_lock);
			
 
				 
			
 
				-	if (!dev->in_fs_metadata) {
			
 
				+	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
			
 
				 		mutex_unlock(&fs_info->scrub_lock);
			
 
				-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				-		scrub_workers_put(root);
			
 
				-		return -ENODEV;
			
 
				+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			
 
				+		scrub_workers_put(fs_info);
			
 
				+		return -EIO;
			
 
				 	}
			
 
				 
			
 
				-	if (dev->scrub_device) {
			
 
				+	btrfs_dev_replace_lock(&fs_info->dev_replace);
			
 
				+	if (dev->scrub_device ||
			
 
				+	    (!is_dev_replace &&
			
 
				+	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
			
 
				+		btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				 		mutex_unlock(&fs_info->scrub_lock);
			
 
				-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				-		scrub_workers_put(root);
			
 
				+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			
 
				+		scrub_workers_put(fs_info);
			
 
				 		return -EINPROGRESS;
			
 
				 	}
			
 
				-	sdev = scrub_setup_dev(dev);
			
 
				-	if (IS_ERR(sdev)) {
			
 
				+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+	sctx = scrub_setup_ctx(dev, is_dev_replace);
			
 
				+	if (IS_ERR(sctx)) {
			
 
				 		mutex_unlock(&fs_info->scrub_lock);
			
 
				-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				-		scrub_workers_put(root);
			
 
				-		return PTR_ERR(sdev);
			
 
				+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			
 
				+		scrub_workers_put(fs_info);
			
 
				+		return PTR_ERR(sctx);
			
 
				 	}
			
 
				-	sdev->readonly = readonly;
			
 
				-	dev->scrub_device = sdev;
			
 
				+	sctx->readonly = readonly;
			
 
				+	dev->scrub_device = sctx;
			
 
				 
			
 
				 	atomic_inc(&fs_info->scrubs_running);
			
 
				 	mutex_unlock(&fs_info->scrub_lock);
			
 
				-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			
 
				 
			
 
				-	down_read(&fs_info->scrub_super_lock);
			
 
				-	ret = scrub_supers(sdev);
			
 
				-	up_read(&fs_info->scrub_super_lock);
			
 
				+	if (!is_dev_replace) {
			
 
				+		down_read(&fs_info->scrub_super_lock);
			
 
				+		ret = scrub_supers(sctx, dev);
			
 
				+		up_read(&fs_info->scrub_super_lock);
			
 
				+	}
			
 
				 
			
 
				 	if (!ret)
			
 
				-		ret = scrub_enumerate_chunks(sdev, start, end);
			
 
				+		ret = scrub_enumerate_chunks(sctx, dev, start, end,
			
 
				+					     is_dev_replace);
			
 
				 
			
 
				-	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
			
 
				+	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
			
 
				 	atomic_dec(&fs_info->scrubs_running);
			
 
				 	wake_up(&fs_info->scrub_pause_wait);
			
 
				 
			
 
				-	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
			
 
				+	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
			
 
				 
			
 
				 	if (progress)
			
 
				-		memcpy(progress, &sdev->stat, sizeof(*progress));
			
 
				+		memcpy(progress, &sctx->stat, sizeof(*progress));
			
 
				 
			
 
				 	mutex_lock(&fs_info->scrub_lock);
			
 
				 	dev->scrub_device = NULL;
			
 
				 	mutex_unlock(&fs_info->scrub_lock);
			
 
				 
			
 
				-	scrub_free_dev(sdev);
			
 
				-	scrub_workers_put(root);
			
 
				+	scrub_free_ctx(sctx);
			
 
				+	scrub_workers_put(fs_info);
			
 
				 
			
 
				 	return ret;
			
 
				 }
			
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
 
				 	up_write(&root->fs_info->scrub_super_lock);
			
 
				 }
			
 
				 
			
 
				-int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
			
 
				+int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
			
 
				 {
			
 
				-
			
 
				 	mutex_lock(&fs_info->scrub_lock);
			
 
				 	if (!atomic_read(&fs_info->scrubs_running)) {
			
 
				 		mutex_unlock(&fs_info->scrub_lock);
			
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-int btrfs_scrub_cancel(struct btrfs_root *root)
			
 
				+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
			
 
				+			   struct btrfs_device *dev)
			
 
				 {
			
 
				-	return __btrfs_scrub_cancel(root->fs_info);
			
 
				-}
			
 
				-
			
 
				-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
			
 
				-{
			
 
				-	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				-	struct scrub_dev *sdev;
			
 
				+	struct scrub_ctx *sctx;
			
 
				 
			
 
				 	mutex_lock(&fs_info->scrub_lock);
			
 
				-	sdev = dev->scrub_device;
			
 
				-	if (!sdev) {
			
 
				+	sctx = dev->scrub_device;
			
 
				+	if (!sctx) {
			
 
				 		mutex_unlock(&fs_info->scrub_lock);
			
 
				 		return -ENOTCONN;
			
 
				 	}
			
 
				-	atomic_inc(&sdev->cancel_req);
			
 
				+	atomic_inc(&sctx->cancel_req);
			
 
				 	while (dev->scrub_device) {
			
 
				 		mutex_unlock(&fs_info->scrub_lock);
			
 
				 		wait_event(fs_info->scrub_pause_wait,
			
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
 
				 	 * does not go away in cancel_dev. FIXME: find a better solution
			
 
				 	 */
			
 
				 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
			
 
				-	dev = btrfs_find_device(root, devid, NULL, NULL);
			
 
				+	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
			
 
				 	if (!dev) {
			
 
				 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			
 
				 		return -ENODEV;
			
 
				 	}
			
 
				-	ret = btrfs_scrub_cancel_dev(root, dev);
			
 
				+	ret = btrfs_scrub_cancel_dev(fs_info, dev);
			
 
				 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			
 
				 
			
 
				 	return ret;
			
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 
				 			 struct btrfs_scrub_progress *progress)
			
 
				 {
			
 
				 	struct btrfs_device *dev;
			
 
				-	struct scrub_dev *sdev = NULL;
			
 
				+	struct scrub_ctx *sctx = NULL;
			
 
				 
			
 
				 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				-	dev = btrfs_find_device(root, devid, NULL, NULL);
			
 
				+	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
			
 
				 	if (dev)
			
 
				-		sdev = dev->scrub_device;
			
 
				-	if (sdev)
			
 
				-		memcpy(progress, &sdev->stat, sizeof(*progress));
			
 
				+		sctx = dev->scrub_device;
			
 
				+	if (sctx)
			
 
				+		memcpy(progress, &sctx->stat, sizeof(*progress));
			
 
				 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				 
			
 
				-	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
			
 
				+	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
			
 
				+}
			
 
				+
			
 
				+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			
 
				+			       u64 extent_logical, u64 extent_len,
			
 
				+			       u64 *extent_physical,
			
 
				+			       struct btrfs_device **extent_dev,
			
 
				+			       int *extent_mirror_num)
			
 
				+{
			
 
				+	u64 mapped_length;
			
 
				+	struct btrfs_bio *bbio = NULL;
			
 
				+	int ret;
			
 
				+
			
 
				+	mapped_length = extent_len;
			
 
				+	ret = btrfs_map_block(fs_info, READ, extent_logical,
			
 
				+			      &mapped_length, &bbio, 0);
			
 
				+	if (ret || !bbio || mapped_length < extent_len ||
			
 
				+	    !bbio->stripes[0].dev->bdev) {
			
 
				+		kfree(bbio);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	*extent_physical = bbio->stripes[0].physical;
			
 
				+	*extent_mirror_num = bbio->mirror_num;
			
 
				+	*extent_dev = bbio->stripes[0].dev;
			
 
				+	kfree(bbio);
			
 
				+}
			
 
				+
			
 
				+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
			
 
				+			      struct scrub_wr_ctx *wr_ctx,
			
 
				+			      struct btrfs_fs_info *fs_info,
			
 
				+			      struct btrfs_device *dev,
			
 
				+			      int is_dev_replace)
			
 
				+{
			
 
				+	WARN_ON(wr_ctx->wr_curr_bio != NULL);
			
 
				+
			
 
				+	mutex_init(&wr_ctx->wr_lock);
			
 
				+	wr_ctx->wr_curr_bio = NULL;
			
 
				+	if (!is_dev_replace)
			
 
				+		return 0;
			
 
				+
			
 
				+	WARN_ON(!dev->bdev);
			
 
				+	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
			
 
				+					 bio_get_nr_vecs(dev->bdev));
			
 
				+	wr_ctx->tgtdev = dev;
			
 
				+	atomic_set(&wr_ctx->flush_all_writes, 0);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
			
 
				+{
			
 
				+	mutex_lock(&wr_ctx->wr_lock);
			
 
				+	kfree(wr_ctx->wr_curr_bio);
			
 
				+	wr_ctx->wr_curr_bio = NULL;
			
 
				+	mutex_unlock(&wr_ctx->wr_lock);
			
 
				+}
			
 
				+
			
 
				+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			
 
				+			    int mirror_num, u64 physical_for_dev_replace)
			
 
				+{
			
 
				+	struct scrub_copy_nocow_ctx *nocow_ctx;
			
 
				+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
			
 
				+
			
 
				+	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
			
 
				+	if (!nocow_ctx) {
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.malloc_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	scrub_pending_trans_workers_inc(sctx);
			
 
				+
			
 
				+	nocow_ctx->sctx = sctx;
			
 
				+	nocow_ctx->logical = logical;
			
 
				+	nocow_ctx->len = len;
			
 
				+	nocow_ctx->mirror_num = mirror_num;
			
 
				+	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
			
 
				+	nocow_ctx->work.func = copy_nocow_pages_worker;
			
 
				+	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
			
 
				+			   &nocow_ctx->work);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void copy_nocow_pages_worker(struct btrfs_work *work)
			
 
				+{
			
 
				+	struct scrub_copy_nocow_ctx *nocow_ctx =
			
 
				+		container_of(work, struct scrub_copy_nocow_ctx, work);
			
 
				+	struct scrub_ctx *sctx = nocow_ctx->sctx;
			
 
				+	u64 logical = nocow_ctx->logical;
			
 
				+	u64 len = nocow_ctx->len;
			
 
				+	int mirror_num = nocow_ctx->mirror_num;
			
 
				+	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
			
 
				+	int ret;
			
 
				+	struct btrfs_trans_handle *trans = NULL;
			
 
				+	struct btrfs_fs_info *fs_info;
			
 
				+	struct btrfs_path *path;
			
 
				+	struct btrfs_root *root;
			
 
				+	int not_written = 0;
			
 
				+
			
 
				+	fs_info = sctx->dev_root->fs_info;
			
 
				+	root = fs_info->extent_root;
			
 
				+
			
 
				+	path = btrfs_alloc_path();
			
 
				+	if (!path) {
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.malloc_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				+		not_written = 1;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	trans = btrfs_join_transaction(root);
			
 
				+	if (IS_ERR(trans)) {
			
 
				+		not_written = 1;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	ret = iterate_inodes_from_logical(logical, fs_info, path,
			
 
				+					  copy_nocow_pages_for_inode,
			
 
				+					  nocow_ctx);
			
 
				+	if (ret != 0 && ret != -ENOENT) {
			
 
				+		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
			
 
				+			(unsigned long long)logical,
			
 
				+			(unsigned long long)physical_for_dev_replace,
			
 
				+			(unsigned long long)len,
			
 
				+			(unsigned long long)mirror_num, ret);
			
 
				+		not_written = 1;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+out:
			
 
				+	if (trans && !IS_ERR(trans))
			
 
				+		btrfs_end_transaction(trans, root);
			
 
				+	if (not_written)
			
 
				+		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
			
 
				+					    num_uncorrectable_read_errors);
			
 
				+
			
 
				+	btrfs_free_path(path);
			
 
				+	kfree(nocow_ctx);
			
 
				+
			
 
				+	scrub_pending_trans_workers_dec(sctx);
			
 
				+}
			
 
				+
			
 
				+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
			
 
				+{
			
 
				+	unsigned long index;
			
 
				+	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
			
 
				+	int ret = 0;
			
 
				+	struct btrfs_key key;
			
 
				+	struct inode *inode = NULL;
			
 
				+	struct btrfs_root *local_root;
			
 
				+	u64 physical_for_dev_replace;
			
 
				+	u64 len;
			
 
				+	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
			
 
				+
			
 
				+	key.objectid = root;
			
 
				+	key.type = BTRFS_ROOT_ITEM_KEY;
			
 
				+	key.offset = (u64)-1;
			
 
				+	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
			
 
				+	if (IS_ERR(local_root))
			
 
				+		return PTR_ERR(local_root);
			
 
				+
			
 
				+	key.type = BTRFS_INODE_ITEM_KEY;
			
 
				+	key.objectid = inum;
			
 
				+	key.offset = 0;
			
 
				+	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
			
 
				+	if (IS_ERR(inode))
			
 
				+		return PTR_ERR(inode);
			
 
				+
			
 
				+	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
			
 
				+	len = nocow_ctx->len;
			
 
				+	while (len >= PAGE_CACHE_SIZE) {
			
 
				+		struct page *page = NULL;
			
 
				+		int ret_sub;
			
 
				+
			
 
				+		index = offset >> PAGE_CACHE_SHIFT;
			
 
				+
			
 
				+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
			
 
				+		if (!page) {
			
 
				+			pr_err("find_or_create_page() failed\n");
			
 
				+			ret = -ENOMEM;
			
 
				+			goto next_page;
			
 
				+		}
			
 
				+
			
 
				+		if (PageUptodate(page)) {
			
 
				+			if (PageDirty(page))
			
 
				+				goto next_page;
			
 
				+		} else {
			
 
				+			ClearPageError(page);
			
 
				+			ret_sub = extent_read_full_page(&BTRFS_I(inode)->
			
 
				+							 io_tree,
			
 
				+							page, btrfs_get_extent,
			
 
				+							nocow_ctx->mirror_num);
			
 
				+			if (ret_sub) {
			
 
				+				ret = ret_sub;
			
 
				+				goto next_page;
			
 
				+			}
			
 
				+			wait_on_page_locked(page);
			
 
				+			if (!PageUptodate(page)) {
			
 
				+				ret = -EIO;
			
 
				+				goto next_page;
			
 
				+			}
			
 
				+		}
			
 
				+		ret_sub = write_page_nocow(nocow_ctx->sctx,
			
 
				+					   physical_for_dev_replace, page);
			
 
				+		if (ret_sub) {
			
 
				+			ret = ret_sub;
			
 
				+			goto next_page;
			
 
				+		}
			
 
				+
			
 
				+next_page:
			
 
				+		if (page) {
			
 
				+			unlock_page(page);
			
 
				+			put_page(page);
			
 
				+		}
			
 
				+		offset += PAGE_CACHE_SIZE;
			
 
				+		physical_for_dev_replace += PAGE_CACHE_SIZE;
			
 
				+		len -= PAGE_CACHE_SIZE;
			
 
				+	}
			
 
				+
			
 
				+	if (inode)
			
 
				+		iput(inode);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static int write_page_nocow(struct scrub_ctx *sctx,
			
 
				+			    u64 physical_for_dev_replace, struct page *page)
			
 
				+{
			
 
				+	struct bio *bio;
			
 
				+	struct btrfs_device *dev;
			
 
				+	int ret;
			
 
				+	DECLARE_COMPLETION_ONSTACK(compl);
			
 
				+
			
 
				+	dev = sctx->wr_ctx.tgtdev;
			
 
				+	if (!dev)
			
 
				+		return -EIO;
			
 
				+	if (!dev->bdev) {
			
 
				+		printk_ratelimited(KERN_WARNING
			
 
				+			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
			
 
				+		return -EIO;
			
 
				+	}
			
 
				+	bio = bio_alloc(GFP_NOFS, 1);
			
 
				+	if (!bio) {
			
 
				+		spin_lock(&sctx->stat_lock);
			
 
				+		sctx->stat.malloc_errors++;
			
 
				+		spin_unlock(&sctx->stat_lock);
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				+	bio->bi_private = &compl;
			
 
				+	bio->bi_end_io = scrub_complete_bio_end_io;
			
 
				+	bio->bi_size = 0;
			
 
				+	bio->bi_sector = physical_for_dev_replace >> 9;
			
 
				+	bio->bi_bdev = dev->bdev;
			
 
				+	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
			
 
				+	if (ret != PAGE_CACHE_SIZE) {
			
 
				+leave_with_eio:
			
 
				+		bio_put(bio);
			
 
				+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
			
 
				+		return -EIO;
			
 
				+	}
			
 
				+	btrfsic_submit_bio(WRITE_SYNC, bio);
			
 
				+	wait_for_completion(&compl);
			
 
				+
			
 
				+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			
 
				+		goto leave_with_eio;
			
 
				+
			
 
				+	bio_put(bio);
			
 
				+	return 0;
			
 
				 }
			
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx)
 
				 	if (!path)
			
 
				 		return -ENOMEM;
			
 
				 
			
 
				-	spin_lock(&send_root->root_times_lock);
			
 
				+	spin_lock(&send_root->root_item_lock);
			
 
				 	start_ctransid = btrfs_root_ctransid(&send_root->root_item);
			
 
				-	spin_unlock(&send_root->root_times_lock);
			
 
				+	spin_unlock(&send_root->root_item_lock);
			
 
				 
			
 
				 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
			
 
				 	key.type = BTRFS_INODE_ITEM_KEY;
			
@@ -4422,9 +4422,9 @@ static int full_send_tree(struct send_ctx *sctx)
 
				 	 * Make sure the tree has not changed after re-joining. We detect this
			
 
				 	 * by comparing start_ctransid and ctransid. They should always match.
			
 
				 	 */
			
 
				-	spin_lock(&send_root->root_times_lock);
			
 
				+	spin_lock(&send_root->root_item_lock);
			
 
				 	ctransid = btrfs_root_ctransid(&send_root->root_item);
			
 
				-	spin_unlock(&send_root->root_times_lock);
			
 
				+	spin_unlock(&send_root->root_item_lock);
			
 
				 
			
 
				 	if (ctransid != start_ctransid) {
			
 
				 		WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
			
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
 
				 #include "export.h"
			
 
				 #include "compression.h"
			
 
				 #include "rcu-string.h"
			
 
				+#include "dev-replace.h"
			
 
				 
			
 
				 #define CREATE_TRACE_POINTS
			
 
				 #include <trace/events/btrfs.h>
			
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
 
				 	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
			
 
				 		sb->s_flags |= MS_RDONLY;
			
 
				 		printk(KERN_INFO "btrfs is forced readonly\n");
			
 
				-		__btrfs_scrub_cancel(fs_info);
			
 
				+		/*
			
 
				+		 * Note that a running device replace operation is not
			
 
				+		 * canceled here although there is no way to update
			
 
				+		 * the progress. It would add the risk of a deadlock,
			
 
				+		 * therefore the canceling is omitted. The only penalty
			
 
				+		 * is that some I/O remains active until the procedure
			
 
				+		 * completes. The next time when the filesystem is
			
 
				+		 * mounted writeable again, the device replace
			
 
				+		 * operation continues.
			
 
				+		 */
			
 
				 //		WARN_ON(1);
			
 
				 	}
			
 
				 }
			
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 
				 	btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
			
 
				 	btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
			
 
				 	btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
			
 
				-	btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
			
 
				+	btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
			
 
				+			      new_pool_size);
			
 
				 }
			
 
				 
			
 
				 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
			
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 
				 		return 0;
			
 
				 
			
 
				 	if (*flags & MS_RDONLY) {
			
 
				+		/*
			
 
				+		 * this also happens on 'umount -rf' or on shutdown, when
			
 
				+		 * the filesystem is busy.
			
 
				+		 */
			
 
				 		sb->s_flags |= MS_RDONLY;
			
 
				 
			
 
				+		btrfs_dev_replace_suspend_for_unmount(fs_info);
			
 
				+		btrfs_scrub_cancel(fs_info);
			
 
				+
			
 
				 		ret = btrfs_commit_super(root);
			
 
				 		if (ret)
			
 
				 			goto restore;
			
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 
				 			goto restore;
			
 
				 		}
			
 
				 
			
 
				+		if (fs_info->fs_devices->missing_devices >
			
 
				+		     fs_info->num_tolerated_disk_barrier_failures &&
			
 
				+		    !(*flags & MS_RDONLY)) {
			
 
				+			printk(KERN_WARNING
			
 
				+			       "Btrfs: too many missing devices, writeable remount is not allowed\n");
			
 
				+			ret = -EACCES;
			
 
				+			goto restore;
			
 
				+		}
			
 
				+
			
 
				 		if (btrfs_super_log_root(fs_info->super_copy) != 0) {
			
 
				 			ret = -EINVAL;
			
 
				 			goto restore;
			
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 
				 		if (ret)
			
 
				 			goto restore;
			
 
				 
			
 
				+		ret = btrfs_resume_dev_replace_async(fs_info);
			
 
				+		if (ret) {
			
 
				+			pr_warn("btrfs: failed to resume dev_replace\n");
			
 
				+			goto restore;
			
 
				+		}
			
 
				 		sb->s_flags &= ~MS_RDONLY;
			
 
				 	}
			
 
				 
			
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 
				 		min_stripe_size = BTRFS_STRIPE_LEN;
			
 
				 
			
 
				 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
			
 
				-		if (!device->in_fs_metadata || !device->bdev)
			
 
				+		if (!device->in_fs_metadata || !device->bdev ||
			
 
				+		    device->is_tgtdev_for_dev_replace)
			
 
				 			continue;
			
 
				 
			
 
				 		avail_space = device->total_bytes - device->bytes_used;
			
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
 
				 	if (err)
			
 
				 		goto free_ordered_data;
			
 
				 
			
 
				-	err = btrfs_interface_init();
			
 
				+	err = btrfs_auto_defrag_init();
			
 
				 	if (err)
			
 
				 		goto free_delayed_inode;
			
 
				 
			
 
				+	err = btrfs_interface_init();
			
 
				+	if (err)
			
 
				+		goto free_auto_defrag;
			
 
				+
			
 
				 	err = register_filesystem(&btrfs_fs_type);
			
 
				 	if (err)
			
 
				 		goto unregister_ioctl;
			
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
 
				 
			
 
				 unregister_ioctl:
			
 
				 	btrfs_interface_exit();
			
 
				+free_auto_defrag:
			
 
				+	btrfs_auto_defrag_exit();
			
 
				 free_delayed_inode:
			
 
				 	btrfs_delayed_inode_exit();
			
 
				 free_ordered_data:
			
@@ -1681,6 +1720,7 @@ static int __init init_btrfs_fs(void)
 
				 static void __exit exit_btrfs_fs(void)
			
 
				 {
			
 
				 	btrfs_destroy_cachep();
			
 
				+	btrfs_auto_defrag_exit();
			
 
				 	btrfs_delayed_inode_exit();
			
 
				 	ordered_data_exit();
			
 
				 	extent_map_exit();
			
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
 
				 #include "tree-log.h"
			
 
				 #include "inode-map.h"
			
 
				 #include "volumes.h"
			
 
				+#include "dev-replace.h"
			
 
				 
			
 
				 #define BTRFS_ROOT_TRANS_TAG 0
			
 
				 
			
@@ -145,16 +146,12 @@ static noinline int join_transaction(struct btrfs_root *root, int type)
 
				 	 * the log must never go across transaction boundaries.
			
 
				 	 */
			
 
				 	smp_mb();
			
 
				-	if (!list_empty(&fs_info->tree_mod_seq_list)) {
			
 
				-		printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
			
 
				+	if (!list_empty(&fs_info->tree_mod_seq_list))
			
 
				+		WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
			
 
				 			"creating a fresh transaction\n");
			
 
				-		WARN_ON(1);
			
 
				-	}
			
 
				-	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
			
 
				-		printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
			
 
				+	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
			
 
				+		WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
			
 
				 			"creating a fresh transaction\n");
			
 
				-		WARN_ON(1);
			
 
				-	}
			
 
				 	atomic_set(&fs_info->tree_mod_seq, 0);
			
 
				 
			
 
				 	spin_lock_init(&cur_trans->commit_lock);
			
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
			
 
				-						    u64 num_items, int type,
			
 
				-						    int noflush)
			
 
				+static struct btrfs_trans_handle *
			
 
				+start_transaction(struct btrfs_root *root, u64 num_items, int type,
			
 
				+		  enum btrfs_reserve_flush_enum flush)
			
 
				 {
			
 
				 	struct btrfs_trans_handle *h;
			
 
				 	struct btrfs_transaction *cur_trans;
			
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 
				 		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
			
 
				 		h = current->journal_info;
			
 
				 		h->use_count++;
			
 
				+		WARN_ON(h->use_count > 2);
			
 
				 		h->orig_rsv = h->block_rsv;
			
 
				 		h->block_rsv = NULL;
			
 
				 		goto got_it;
			
@@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 
				 		}
			
 
				 
			
 
				 		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
			
 
				-		if (noflush)
			
 
				-			ret = btrfs_block_rsv_add_noflush(root,
			
 
				-						&root->fs_info->trans_block_rsv,
			
 
				-						num_bytes);
			
 
				-		else
			
 
				-			ret = btrfs_block_rsv_add(root,
			
 
				-						&root->fs_info->trans_block_rsv,
			
 
				-						num_bytes);
			
 
				+		ret = btrfs_block_rsv_add(root,
			
 
				+					  &root->fs_info->trans_block_rsv,
			
 
				+					  num_bytes, flush);
			
 
				 		if (ret)
			
 
				 			return ERR_PTR(ret);
			
 
				 	}
			
@@ -422,13 +415,15 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 
				 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
			
 
				 						   int num_items)
			
 
				 {
			
 
				-	return start_transaction(root, num_items, TRANS_START, 0);
			
 
				+	return start_transaction(root, num_items, TRANS_START,
			
 
				+				 BTRFS_RESERVE_FLUSH_ALL);
			
 
				 }
			
 
				 
			
 
				-struct btrfs_trans_handle *btrfs_start_transaction_noflush(
			
 
				+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
			
 
				 					struct btrfs_root *root, int num_items)
			
 
				 {
			
 
				-	return start_transaction(root, num_items, TRANS_START, 1);
			
 
				+	return start_transaction(root, num_items, TRANS_START,
			
 
				+				 BTRFS_RESERVE_FLUSH_LIMIT);
			
 
				 }
			
 
				 
			
 
				 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
			
@@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
 
				 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
			
 
				 {
			
 
				 	struct btrfs_transaction *cur_trans = NULL, *t;
			
 
				-	int ret;
			
 
				+	int ret = 0;
			
 
				 
			
 
				-	ret = 0;
			
 
				 	if (transid) {
			
 
				 		if (transid <= root->fs_info->last_trans_committed)
			
 
				 			goto out;
			
 
				 
			
 
				+		ret = -EINVAL;
			
 
				 		/* find specified transaction */
			
 
				 		spin_lock(&root->fs_info->trans_lock);
			
 
				 		list_for_each_entry(t, &root->fs_info->trans_list, list) {
			
 
				 			if (t->transid == transid) {
			
 
				 				cur_trans = t;
			
 
				 				atomic_inc(&cur_trans->use_count);
			
 
				+				ret = 0;
			
 
				 				break;
			
 
				 			}
			
 
				-			if (t->transid > transid)
			
 
				+			if (t->transid > transid) {
			
 
				+				ret = 0;
			
 
				 				break;
			
 
				+			}
			
 
				 		}
			
 
				 		spin_unlock(&root->fs_info->trans_lock);
			
 
				-		ret = -EINVAL;
			
 
				+		/* The specified transaction doesn't exist */
			
 
				 		if (!cur_trans)
			
 
				-			goto out;  /* bad transid */
			
 
				+			goto out;
			
 
				 	} else {
			
 
				 		/* find newest transaction that is committing | committed */
			
 
				 		spin_lock(&root->fs_info->trans_lock);
			
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 
				 	}
			
 
				 
			
 
				 	wait_for_commit(root, cur_trans);
			
 
				-
			
 
				 	put_transaction(cur_trans);
			
 
				-	ret = 0;
			
 
				 out:
			
 
				 	return ret;
			
 
				 }
			
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 
				 		return ret;
			
 
				 
			
 
				 	ret = btrfs_run_dev_stats(trans, root->fs_info);
			
 
				-	BUG_ON(ret);
			
 
				+	WARN_ON(ret);
			
 
				+	ret = btrfs_run_dev_replace(trans, root->fs_info);
			
 
				+	WARN_ON(ret);
			
 
				 
			
 
				 	ret = btrfs_run_qgroups(trans, root->fs_info);
			
 
				 	BUG_ON(ret);
			
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 
				 	switch_commit_root(fs_info->extent_root);
			
 
				 	up_write(&fs_info->extent_commit_sem);
			
 
				 
			
 
				+	btrfs_after_dev_replace_commit(fs_info);
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 
				 	struct btrfs_fs_info *info = root->fs_info;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				 	int ret;
			
 
				-	unsigned long nr;
			
 
				 
			
 
				 	if (xchg(&root->defrag_running, 1))
			
 
				 		return 0;
			
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 
				 
			
 
				 		ret = btrfs_defrag_leaves(trans, root, cacheonly);
			
 
				 
			
 
				-		nr = trans->blocks_used;
			
 
				 		btrfs_end_transaction(trans, root);
			
 
				-		btrfs_btree_balance_dirty(info->tree_root, nr);
			
 
				+		btrfs_btree_balance_dirty(info->tree_root);
			
 
				 		cond_resched();
			
 
				 
			
 
				 		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
			
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
				 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
			
 
				 
			
 
				 	if (to_reserve > 0) {
			
 
				-		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
			
 
				-						  to_reserve);
			
 
				+		ret = btrfs_block_rsv_add(root, &pending->block_rsv,
			
 
				+					  to_reserve,
			
 
				+					  BTRFS_RESERVE_NO_FLUSH);
			
 
				 		if (ret) {
			
 
				 			pending->error = ret;
			
 
				 			goto no_free_objectid;
			
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
				 				    parent_inode, &key,
			
 
				 				    BTRFS_FT_DIR, index);
			
 
				 	/* We have check then name at the beginning, so it is impossible. */
			
 
				-	BUG_ON(ret == -EEXIST);
			
 
				+	BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
			
 
				 	if (ret) {
			
 
				 		btrfs_abort_transaction(trans, root, ret);
			
 
				 		goto fail;
			
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work)
 
				 	 * We've got freeze protection passed with the transaction.
			
 
				 	 * Tell lockdep about it.
			
 
				 	 */
			
 
				-	rwsem_acquire_read(
			
 
				-		&ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
			
 
				-		0, 1, _THIS_IP_);
			
 
				+	if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
			
 
				+		rwsem_acquire_read(
			
 
				+		     &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
			
 
				+		     0, 1, _THIS_IP_);
			
 
				 
			
 
				 	current->journal_info = ac->newtrans;
			
 
				 
			
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 
				 	 * Tell lockdep we've released the freeze rwsem, since the
			
 
				 	 * async commit thread will be the one to unlock it.
			
 
				 	 */
			
 
				-	rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
			
 
				-		      1, _THIS_IP_);
			
 
				+	if (trans->type < TRANS_JOIN_NOLOCK)
			
 
				+		rwsem_release(
			
 
				+			&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
			
 
				+			1, _THIS_IP_);
			
 
				 
			
 
				 	schedule_delayed_work(&ac->work, 0);
			
 
				 
			
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 
				 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
			
 
				 }
			
 
				 
			
 
				+static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
			
 
				+					  struct btrfs_root *root)
			
 
				+{
			
 
				+	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
			
 
				+	int snap_pending = 0;
			
 
				+	int ret;
			
 
				+
			
 
				+	if (!flush_on_commit) {
			
 
				+		spin_lock(&root->fs_info->trans_lock);
			
 
				+		if (!list_empty(&trans->transaction->pending_snapshots))
			
 
				+			snap_pending = 1;
			
 
				+		spin_unlock(&root->fs_info->trans_lock);
			
 
				+	}
			
 
				+
			
 
				+	if (flush_on_commit || snap_pending) {
			
 
				+		btrfs_start_delalloc_inodes(root, 1);
			
 
				+		btrfs_wait_ordered_extents(root, 1);
			
 
				+	}
			
 
				+
			
 
				+	ret = btrfs_run_delayed_items(trans, root);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	/*
			
 
				+	 * running the delayed items may have added new refs. account
			
 
				+	 * them now so that they hinder processing of more delayed refs
			
 
				+	 * as little as possible.
			
 
				+	 */
			
 
				+	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
			
 
				+
			
 
				+	/*
			
 
				+	 * rename doesn't use btrfs_join_transaction, so, once we
			
 
				+	 * set the transaction to blocked above, we aren't going
			
 
				+	 * to get any new ordered operations.  We can safely run
			
 
				+	 * it here and know for sure that nothing new will be added
			
 
				+	 * to the list
			
 
				+	 */
			
 
				+	btrfs_run_ordered_operations(root, 1);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * btrfs_transaction state sequence:
			
 
				  *    in_commit = 0, blocked = 0  (initial)
			
@@ -1414,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
				 	struct btrfs_transaction *cur_trans = trans->transaction;
			
 
				 	struct btrfs_transaction *prev_trans = NULL;
			
 
				 	DEFINE_WAIT(wait);
			
 
				-	int ret = -EIO;
			
 
				+	int ret;
			
 
				 	int should_grow = 0;
			
 
				 	unsigned long now = get_seconds();
			
 
				-	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
			
 
				 
			
 
				-	btrfs_run_ordered_operations(root, 0);
			
 
				+	ret = btrfs_run_ordered_operations(root, 0);
			
 
				+	if (ret) {
			
 
				+		btrfs_abort_transaction(trans, root, ret);
			
 
				+		goto cleanup_transaction;
			
 
				+	}
			
 
				 
			
 
				-	if (cur_trans->aborted)
			
 
				+	if (cur_trans->aborted) {
			
 
				+		ret = cur_trans->aborted;
			
 
				 		goto cleanup_transaction;
			
 
				+	}
			
 
				 
			
 
				 	/* make a pass through all the delayed refs we have so far
			
 
				 	 * any runnings procs may add more while we are here
			
@@ -1490,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
				 		should_grow = 1;
			
 
				 
			
 
				 	do {
			
 
				-		int snap_pending = 0;
			
 
				-
			
 
				 		joined = cur_trans->num_joined;
			
 
				-		if (!list_empty(&trans->transaction->pending_snapshots))
			
 
				-			snap_pending = 1;
			
 
				 
			
 
				 		WARN_ON(cur_trans != trans->transaction);
			
 
				 
			
 
				-		if (flush_on_commit || snap_pending) {
			
 
				-			btrfs_start_delalloc_inodes(root, 1);
			
 
				-			btrfs_wait_ordered_extents(root, 1);
			
 
				-		}
			
 
				-
			
 
				-		ret = btrfs_run_delayed_items(trans, root);
			
 
				+		ret = btrfs_flush_all_pending_stuffs(trans, root);
			
 
				 		if (ret)
			
 
				 			goto cleanup_transaction;
			
 
				 
			
 
				-		/*
			
 
				-		 * running the delayed items may have added new refs. account
			
 
				-		 * them now so that they hinder processing of more delayed refs
			
 
				-		 * as little as possible.
			
 
				-		 */
			
 
				-		btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
			
 
				-
			
 
				-		/*
			
 
				-		 * rename don't use btrfs_join_transaction, so, once we
			
 
				-		 * set the transaction to blocked above, we aren't going
			
 
				-		 * to get any new ordered operations.  We can safely run
			
 
				-		 * it here and no for sure that nothing new will be added
			
 
				-		 * to the list
			
 
				-		 */
			
 
				-		btrfs_run_ordered_operations(root, 1);
			
 
				-
			
 
				 		prepare_to_wait(&cur_trans->writer_wait, &wait,
			
 
				 				TASK_UNINTERRUPTIBLE);
			
 
				 
			
@@ -1535,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
				 	} while (atomic_read(&cur_trans->num_writers) > 1 ||
			
 
				 		 (should_grow && cur_trans->num_joined != joined));
			
 
				 
			
 
				+	ret = btrfs_flush_all_pending_stuffs(trans, root);
			
 
				+	if (ret)
			
 
				+		goto cleanup_transaction;
			
 
				+
			
 
				 	/*
			
 
				 	 * Ok now we need to make sure to block out any other joins while we
			
 
				 	 * commit the transaction.  We could have started a join before setting
			
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
				 			  struct btrfs_root *root);
			
 
				 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
			
 
				 						   int num_items);
			
 
				-struct btrfs_trans_handle *btrfs_start_transaction_noflush(
			
 
				+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
			
 
				 					struct btrfs_root *root, int num_items);
			
 
				 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
			
 
				 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
			
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 
				 			    struct btrfs_inode_item *item,
			
 
				 			    struct inode *inode, int log_inode_only)
			
 
				 {
			
 
				-	btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
			
 
				-	btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
			
 
				-	btrfs_set_inode_mode(leaf, item, inode->i_mode);
			
 
				-	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
			
 
				-
			
 
				-	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
			
 
				-			       inode->i_atime.tv_sec);
			
 
				-	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
			
 
				-				inode->i_atime.tv_nsec);
			
 
				-
			
 
				-	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
			
 
				-			       inode->i_mtime.tv_sec);
			
 
				-	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
			
 
				-				inode->i_mtime.tv_nsec);
			
 
				-
			
 
				-	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
			
 
				-			       inode->i_ctime.tv_sec);
			
 
				-	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
			
 
				-				inode->i_ctime.tv_nsec);
			
 
				-
			
 
				-	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
			
 
				-
			
 
				-	btrfs_set_inode_sequence(leaf, item, inode->i_version);
			
 
				-	btrfs_set_inode_transid(leaf, item, trans->transid);
			
 
				-	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
			
 
				-	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
			
 
				-	btrfs_set_inode_block_group(leaf, item, 0);
			
 
				+	struct btrfs_map_token token;
			
 
				+
			
 
				+	btrfs_init_map_token(&token);
			
 
				 
			
 
				 	if (log_inode_only) {
			
 
				 		/* set the generation to zero so the recover code
			
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 
				 		 * just to say 'this inode exists' and a logging
			
 
				 		 * to say 'update this inode with these values'
			
 
				 		 */
			
 
				-		btrfs_set_inode_generation(leaf, item, 0);
			
 
				-		btrfs_set_inode_size(leaf, item, 0);
			
 
				+		btrfs_set_token_inode_generation(leaf, item, 0, &token);
			
 
				+		btrfs_set_token_inode_size(leaf, item, 0, &token);
			
 
				 	} else {
			
 
				-		btrfs_set_inode_generation(leaf, item,
			
 
				-					   BTRFS_I(inode)->generation);
			
 
				-		btrfs_set_inode_size(leaf, item, inode->i_size);
			
 
				-	}
			
 
				+		btrfs_set_token_inode_generation(leaf, item,
			
 
				+						 BTRFS_I(inode)->generation,
			
 
				+						 &token);
			
 
				+		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
			
 
				+	}
			
 
				+
			
 
				+	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
			
 
				+	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
			
 
				+	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
			
 
				+	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
			
 
				+
			
 
				+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
			
 
				+				     inode->i_atime.tv_sec, &token);
			
 
				+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
			
 
				+				      inode->i_atime.tv_nsec, &token);
			
 
				+
			
 
				+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
			
 
				+				     inode->i_mtime.tv_sec, &token);
			
 
				+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
			
 
				+				      inode->i_mtime.tv_nsec, &token);
			
 
				+
			
 
				+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
			
 
				+				     inode->i_ctime.tv_sec, &token);
			
 
				+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
			
 
				+				      inode->i_ctime.tv_nsec, &token);
			
 
				+
			
 
				+	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
			
 
				+				     &token);
			
 
				+
			
 
				+	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
			
 
				+	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
			
 
				+	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
			
 
				+	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
			
 
				+	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
			
 
				+}
			
 
				 
			
 
				+static int log_inode_item(struct btrfs_trans_handle *trans,
			
 
				+			  struct btrfs_root *log, struct btrfs_path *path,
			
 
				+			  struct inode *inode)
			
 
				+{
			
 
				+	struct btrfs_inode_item *inode_item;
			
 
				+	struct btrfs_key key;
			
 
				+	int ret;
			
 
				+
			
 
				+	memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
			
 
				+	ret = btrfs_insert_empty_item(trans, log, path, &key,
			
 
				+				      sizeof(*inode_item));
			
 
				+	if (ret && ret != -EEXIST)
			
 
				+		return ret;
			
 
				+	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			
 
				+				    struct btrfs_inode_item);
			
 
				+	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
			
 
				+	btrfs_release_path(path);
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				 static noinline int copy_items(struct btrfs_trans_handle *trans,
			
@@ -3130,151 +3155,234 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-struct log_args {
			
 
				-	struct extent_buffer *src;
			
 
				-	u64 next_offset;
			
 
				-	int start_slot;
			
 
				-	int nr;
			
 
				-};
			
 
				+static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
			
 
				+				 struct btrfs_root *root, struct inode *inode,
			
 
				+				 struct extent_map *em,
			
 
				+				 struct btrfs_path *path)
			
 
				+{
			
 
				+	struct btrfs_file_extent_item *fi;
			
 
				+	struct extent_buffer *leaf;
			
 
				+	struct btrfs_key key, new_key;
			
 
				+	struct btrfs_map_token token;
			
 
				+	u64 extent_end;
			
 
				+	u64 extent_offset = 0;
			
 
				+	int extent_type;
			
 
				+	int del_slot = 0;
			
 
				+	int del_nr = 0;
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	while (1) {
			
 
				+		btrfs_init_map_token(&token);
			
 
				+		leaf = path->nodes[0];
			
 
				+		path->slots[0]++;
			
 
				+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			
 
				+			if (del_nr) {
			
 
				+				ret = btrfs_del_items(trans, root, path,
			
 
				+						      del_slot, del_nr);
			
 
				+				if (ret)
			
 
				+					return ret;
			
 
				+				del_nr = 0;
			
 
				+			}
			
 
				+
			
 
				+			ret = btrfs_next_leaf_write(trans, root, path, 1);
			
 
				+			if (ret < 0)
			
 
				+				return ret;
			
 
				+			if (ret > 0)
			
 
				+				return 0;
			
 
				+			leaf = path->nodes[0];
			
 
				+		}
			
 
				+
			
 
				+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			
 
				+		if (key.objectid != btrfs_ino(inode) ||
			
 
				+		    key.type != BTRFS_EXTENT_DATA_KEY ||
			
 
				+		    key.offset >= em->start + em->len)
			
 
				+			break;
			
 
				+
			
 
				+		fi = btrfs_item_ptr(leaf, path->slots[0],
			
 
				+				    struct btrfs_file_extent_item);
			
 
				+		extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
			
 
				+		if (extent_type == BTRFS_FILE_EXTENT_REG ||
			
 
				+		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			
 
				+			extent_offset = btrfs_token_file_extent_offset(leaf,
			
 
				+								fi, &token);
			
 
				+			extent_end = key.offset +
			
 
				+				btrfs_token_file_extent_num_bytes(leaf, fi,
			
 
				+								  &token);
			
 
				+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			
 
				+			extent_end = key.offset +
			
 
				+				btrfs_file_extent_inline_len(leaf, fi);
			
 
				+		} else {
			
 
				+			BUG();
			
 
				+		}
			
 
				+
			
 
				+		if (extent_end <= em->len + em->start) {
			
 
				+			if (!del_nr) {
			
 
				+				del_slot = path->slots[0];
			
 
				+			}
			
 
				+			del_nr++;
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		/*
			
 
				+		 * Ok so we'll ignore previous items if we log a new extent,
			
 
				+		 * which can lead to overlapping extents, so if we have an
			
 
				+		 * existing extent we want to adjust we _have_ to check the next
			
 
				+		 * guy to make sure we even need this extent anymore, this keeps
			
 
				+		 * us from panicing in set_item_key_safe.
			
 
				+		 */
			
 
				+		if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
			
 
				+			struct btrfs_key tmp_key;
			
 
				+
			
 
				+			btrfs_item_key_to_cpu(leaf, &tmp_key,
			
 
				+					      path->slots[0] + 1);
			
 
				+			if (tmp_key.objectid == btrfs_ino(inode) &&
			
 
				+			    tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
			
 
				+			    tmp_key.offset <= em->start + em->len) {
			
 
				+				if (!del_nr)
			
 
				+					del_slot = path->slots[0];
			
 
				+				del_nr++;
			
 
				+				continue;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
			
 
				+		memcpy(&new_key, &key, sizeof(new_key));
			
 
				+		new_key.offset = em->start + em->len;
			
 
				+		btrfs_set_item_key_safe(trans, root, path, &new_key);
			
 
				+		extent_offset += em->start + em->len - key.offset;
			
 
				+		btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
			
 
				+						   &token);
			
 
				+		btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
			
 
				+						      (em->start + em->len),
			
 
				+						      &token);
			
 
				+		btrfs_mark_buffer_dirty(leaf);
			
 
				+	}
			
 
				+
			
 
				+	if (del_nr)
			
 
				+		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				 
			
 
				 static int log_one_extent(struct btrfs_trans_handle *trans,
			
 
				 			  struct inode *inode, struct btrfs_root *root,
			
 
				-			  struct extent_map *em, struct btrfs_path *path,
			
 
				-			  struct btrfs_path *dst_path, struct log_args *args)
			
 
				+			  struct extent_map *em, struct btrfs_path *path)
			
 
				 {
			
 
				 	struct btrfs_root *log = root->log_root;
			
 
				 	struct btrfs_file_extent_item *fi;
			
 
				+	struct extent_buffer *leaf;
			
 
				+	struct list_head ordered_sums;
			
 
				+	struct btrfs_map_token token;
			
 
				 	struct btrfs_key key;
			
 
				-	u64 start = em->mod_start;
			
 
				-	u64 search_start = start;
			
 
				-	u64 len = em->mod_len;
			
 
				-	u64 num_bytes;
			
 
				-	int nritems;
			
 
				+	u64 csum_offset = em->mod_start - em->start;
			
 
				+	u64 csum_len = em->mod_len;
			
 
				+	u64 extent_offset = em->start - em->orig_start;
			
 
				+	u64 block_len;
			
 
				 	int ret;
			
 
				+	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
			
 
				 
			
 
				-	if (BTRFS_I(inode)->logged_trans == trans->transid) {
			
 
				-		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
			
 
				-					   start + len, NULL, 0);
			
 
				-		if (ret)
			
 
				-			return ret;
			
 
				+	INIT_LIST_HEAD(&ordered_sums);
			
 
				+	btrfs_init_map_token(&token);
			
 
				+	key.objectid = btrfs_ino(inode);
			
 
				+	key.type = BTRFS_EXTENT_DATA_KEY;
			
 
				+	key.offset = em->start;
			
 
				+	path->really_keep_locks = 1;
			
 
				+
			
 
				+	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
			
 
				+	if (ret && ret != -EEXIST) {
			
 
				+		path->really_keep_locks = 0;
			
 
				+		return ret;
			
 
				 	}
			
 
				+	leaf = path->nodes[0];
			
 
				+	fi = btrfs_item_ptr(leaf, path->slots[0],
			
 
				+			    struct btrfs_file_extent_item);
			
 
				+	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
			
 
				+					       &token);
			
 
				+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
			
 
				+		skip_csum = true;
			
 
				+		btrfs_set_token_file_extent_type(leaf, fi,
			
 
				+						 BTRFS_FILE_EXTENT_PREALLOC,
			
 
				+						 &token);
			
 
				+	} else {
			
 
				+		btrfs_set_token_file_extent_type(leaf, fi,
			
 
				+						 BTRFS_FILE_EXTENT_REG,
			
 
				+						 &token);
			
 
				+		if (em->block_start == 0)
			
 
				+			skip_csum = true;
			
 
				+	}
			
 
				+
			
 
				+	block_len = max(em->block_len, em->orig_block_len);
			
 
				+	if (em->compress_type != BTRFS_COMPRESS_NONE) {
			
 
				+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
			
 
				+							em->block_start,
			
 
				+							&token);
			
 
				+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
			
 
				+							   &token);
			
 
				+	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
			
 
				+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
			
 
				+							em->block_start -
			
 
				+							extent_offset, &token);
			
 
				+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
			
 
				+							   &token);
			
 
				+	} else {
			
 
				+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
			
 
				+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
			
 
				+							   &token);
			
 
				+	}
			
 
				+
			
 
				+	btrfs_set_token_file_extent_offset(leaf, fi,
			
 
				+					   em->start - em->orig_start,
			
 
				+					   &token);
			
 
				+	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
			
 
				+	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
			
 
				+	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
			
 
				+						&token);
			
 
				+	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
			
 
				+	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
			
 
				+	btrfs_mark_buffer_dirty(leaf);
			
 
				 
			
 
				-	while (len) {
			
 
				-		if (args->nr)
			
 
				-			goto next_slot;
			
 
				-again:
			
 
				-		key.objectid = btrfs_ino(inode);
			
 
				-		key.type = BTRFS_EXTENT_DATA_KEY;
			
 
				-		key.offset = search_start;
			
 
				-
			
 
				-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
			
 
				-		if (ret < 0)
			
 
				-			return ret;
			
 
				-
			
 
				-		if (ret) {
			
 
				-			/*
			
 
				-			 * A rare case were we can have an em for a section of a
			
 
				-			 * larger extent so we need to make sure that this em
			
 
				-			 * falls within the extent we've found.  If not we just
			
 
				-			 * bail and go back to ye-olde way of doing things but
			
 
				-			 * it happens often enough in testing that we need to do
			
 
				-			 * this dance to make sure.
			
 
				-			 */
			
 
				-			do {
			
 
				-				if (path->slots[0] == 0) {
			
 
				-					btrfs_release_path(path);
			
 
				-					if (search_start == 0)
			
 
				-						return -ENOENT;
			
 
				-					search_start--;
			
 
				-					goto again;
			
 
				-				}
			
 
				-
			
 
				-				path->slots[0]--;
			
 
				-				btrfs_item_key_to_cpu(path->nodes[0], &key,
			
 
				-						      path->slots[0]);
			
 
				-				if (key.objectid != btrfs_ino(inode) ||
			
 
				-				    key.type != BTRFS_EXTENT_DATA_KEY) {
			
 
				-					btrfs_release_path(path);
			
 
				-					return -ENOENT;
			
 
				-				}
			
 
				-			} while (key.offset > start);
			
 
				+	/*
			
 
				+	 * Have to check the extent to the right of us to make sure it doesn't
			
 
				+	 * fall in our current range.  We're ok if the previous extent is in our
			
 
				+	 * range since the recovery stuff will run us in key order and thus just
			
 
				+	 * drop the part we overwrote.
			
 
				+	 */
			
 
				+	ret = drop_adjacent_extents(trans, log, inode, em, path);
			
 
				+	btrfs_release_path(path);
			
 
				+	path->really_keep_locks = 0;
			
 
				+	if (ret) {
			
 
				+		return ret;
			
 
				+	}
			
 
				 
			
 
				-			fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
			
 
				-					    struct btrfs_file_extent_item);
			
 
				-			num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
			
 
				-								fi);
			
 
				-			if (key.offset + num_bytes <= start) {
			
 
				-				btrfs_release_path(path);
			
 
				-				return -ENOENT;
			
 
				-			}
			
 
				-		}
			
 
				-		args->src = path->nodes[0];
			
 
				-next_slot:
			
 
				-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
			
 
				-		fi = btrfs_item_ptr(args->src, path->slots[0],
			
 
				-				    struct btrfs_file_extent_item);
			
 
				-		if (args->nr &&
			
 
				-		    args->start_slot + args->nr == path->slots[0]) {
			
 
				-			args->nr++;
			
 
				-		} else if (args->nr) {
			
 
				-			ret = copy_items(trans, inode, dst_path, args->src,
			
 
				-					 args->start_slot, args->nr,
			
 
				-					 LOG_INODE_ALL);
			
 
				-			if (ret)
			
 
				-				return ret;
			
 
				-			args->nr = 1;
			
 
				-			args->start_slot = path->slots[0];
			
 
				-		} else if (!args->nr) {
			
 
				-			args->nr = 1;
			
 
				-			args->start_slot = path->slots[0];
			
 
				-		}
			
 
				-		nritems = btrfs_header_nritems(path->nodes[0]);
			
 
				-		path->slots[0]++;
			
 
				-		num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
			
 
				-		if (len < num_bytes) {
			
 
				-			/* I _think_ this is ok, envision we write to a
			
 
				-			 * preallocated space that is adjacent to a previously
			
 
				-			 * written preallocated space that gets merged when we
			
 
				-			 * mark this preallocated space written.  If we do not
			
 
				-			 * have the adjacent extent in cache then when we copy
			
 
				-			 * this extent it could end up being larger than our EM
			
 
				-			 * thinks it is, which is a-ok, so just set len to 0.
			
 
				-			 */
			
 
				-			len = 0;
			
 
				-		} else {
			
 
				-			len -= num_bytes;
			
 
				-		}
			
 
				-		start = key.offset + num_bytes;
			
 
				-		args->next_offset = start;
			
 
				-		search_start = start;
			
 
				+	if (skip_csum)
			
 
				+		return 0;
			
 
				 
			
 
				-		if (path->slots[0] < nritems) {
			
 
				-			if (len)
			
 
				-				goto next_slot;
			
 
				-			break;
			
 
				-		}
			
 
				+	/* block start is already adjusted for the file extent offset. */
			
 
				+	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
			
 
				+				       em->block_start + csum_offset,
			
 
				+				       em->block_start + csum_offset +
			
 
				+				       csum_len - 1, &ordered_sums, 0);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				 
			
 
				-		if (args->nr) {
			
 
				-			ret = copy_items(trans, inode, dst_path, args->src,
			
 
				-					 args->start_slot, args->nr,
			
 
				-					 LOG_INODE_ALL);
			
 
				-			if (ret)
			
 
				-				return ret;
			
 
				-			args->nr = 0;
			
 
				-			btrfs_release_path(path);
			
 
				-		}
			
 
				+	while (!list_empty(&ordered_sums)) {
			
 
				+		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
			
 
				+						   struct btrfs_ordered_sum,
			
 
				+						   list);
			
 
				+		if (!ret)
			
 
				+			ret = btrfs_csum_file_blocks(trans, log, sums);
			
 
				+		list_del(&sums->list);
			
 
				+		kfree(sums);
			
 
				 	}
			
 
				 
			
 
				-	return 0;
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
			
 
				 				     struct btrfs_root *root,
			
 
				 				     struct inode *inode,
			
 
				-				     struct btrfs_path *path,
			
 
				-				     struct btrfs_path *dst_path)
			
 
				+				     struct btrfs_path *path)
			
 
				 {
			
 
				-	struct log_args args;
			
 
				 	struct extent_map *em, *n;
			
 
				 	struct list_head extents;
			
 
				 	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
			
@@ -3283,8 +3391,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
				 
			
 
				 	INIT_LIST_HEAD(&extents);
			
 
				 
			
 
				-	memset(&args, 0, sizeof(args));
			
 
				-
			
 
				 	write_lock(&tree->lock);
			
 
				 	test_gen = root->fs_info->last_trans_committed;
			
 
				 
			
@@ -3317,34 +3423,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
				 
			
 
				 		write_unlock(&tree->lock);
			
 
				 
			
 
				-		/*
			
 
				-		 * If the previous EM and the last extent we left off on aren't
			
 
				-		 * sequential then we need to copy the items we have and redo
			
 
				-		 * our search
			
 
				-		 */
			
 
				-		if (args.nr && em->mod_start != args.next_offset) {
			
 
				-			ret = copy_items(trans, inode, dst_path, args.src,
			
 
				-					 args.start_slot, args.nr,
			
 
				-					 LOG_INODE_ALL);
			
 
				-			if (ret) {
			
 
				-				free_extent_map(em);
			
 
				-				write_lock(&tree->lock);
			
 
				-				continue;
			
 
				-			}
			
 
				-			btrfs_release_path(path);
			
 
				-			args.nr = 0;
			
 
				-		}
			
 
				-
			
 
				-		ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
			
 
				+		ret = log_one_extent(trans, inode, root, em, path);
			
 
				 		free_extent_map(em);
			
 
				 		write_lock(&tree->lock);
			
 
				 	}
			
 
				 	WARN_ON(!list_empty(&extents));
			
 
				 	write_unlock(&tree->lock);
			
 
				 
			
 
				-	if (!ret && args.nr)
			
 
				-		ret = copy_items(trans, inode, dst_path, args.src,
			
 
				-				 args.start_slot, args.nr, LOG_INODE_ALL);
			
 
				 	btrfs_release_path(path);
			
 
				 	return ret;
			
 
				 }
			
@@ -3400,7 +3485,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
				 
			
 
				 
			
 
				 	/* today the code can only do partial logging of directories */
			
 
				-	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
			
 
				+	if (S_ISDIR(inode->i_mode) ||
			
 
				+	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			
 
				+		       &BTRFS_I(inode)->runtime_flags) &&
			
 
				+	     inode_only == LOG_INODE_EXISTS))
			
 
				 		max_key.type = BTRFS_XATTR_ITEM_KEY;
			
 
				 	else
			
 
				 		max_key.type = (u8)-1;
			
@@ -3432,14 +3520,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
				 	} else {
			
 
				 		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			
 
				 				       &BTRFS_I(inode)->runtime_flags)) {
			
 
				+			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
			
 
				+				  &BTRFS_I(inode)->runtime_flags);
			
 
				 			ret = btrfs_truncate_inode_items(trans, log,
			
 
				 							 inode, 0, 0);
			
 
				-		} else {
			
 
				-			fast_search = true;
			
 
				+		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
			
 
				+					      &BTRFS_I(inode)->runtime_flags)) {
			
 
				+			if (inode_only == LOG_INODE_ALL)
			
 
				+				fast_search = true;
			
 
				 			max_key.type = BTRFS_XATTR_ITEM_KEY;
			
 
				 			ret = drop_objectid_items(trans, log, path, ino,
			
 
				-						  BTRFS_XATTR_ITEM_KEY);
			
 
				+						  max_key.type);
			
 
				+		} else {
			
 
				+			if (inode_only == LOG_INODE_ALL)
			
 
				+				fast_search = true;
			
 
				+			ret = log_inode_item(trans, log, dst_path, inode);
			
 
				+			if (ret) {
			
 
				+				err = ret;
			
 
				+				goto out_unlock;
			
 
				+			}
			
 
				+			goto log_extents;
			
 
				 		}
			
 
				+
			
 
				 	}
			
 
				 	if (ret) {
			
 
				 		err = ret;
			
@@ -3518,11 +3620,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
				 		ins_nr = 0;
			
 
				 	}
			
 
				 
			
 
				+log_extents:
			
 
				 	if (fast_search) {
			
 
				-		btrfs_release_path(path);
			
 
				 		btrfs_release_path(dst_path);
			
 
				-		ret = btrfs_log_changed_extents(trans, root, inode, path,
			
 
				-						dst_path);
			
 
				+		ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
			
 
				 		if (ret) {
			
 
				 			err = ret;
			
 
				 			goto out_unlock;
			
@@ -3531,8 +3632,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
				 		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
			
 
				 		struct extent_map *em, *n;
			
 
				 
			
 
				+		write_lock(&tree->lock);
			
 
				 		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
			
 
				 			list_del_init(&em->list);
			
 
				+		write_unlock(&tree->lock);
			
 
				 	}
			
 
				 
			
 
				 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
			
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
 
				 #include <linux/capability.h>
			
 
				 #include <linux/ratelimit.h>
			
 
				 #include <linux/kthread.h>
			
 
				-#include <asm/div64.h>
			
 
				 #include "compat.h"
			
 
				 #include "ctree.h"
			
 
				 #include "extent_map.h"
			
@@ -36,6 +35,8 @@
 
				 #include "async-thread.h"
			
 
				 #include "check-integrity.h"
			
 
				 #include "rcu-string.h"
			
 
				+#include "math.h"
			
 
				+#include "dev-replace.h"
			
 
				 
			
 
				 static int init_first_rw_device(struct btrfs_trans_handle *trans,
			
 
				 				struct btrfs_root *root,
			
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 
				 	kfree(fs_devices);
			
 
				 }
			
 
				 
			
 
				+static void btrfs_kobject_uevent(struct block_device *bdev,
			
 
				+				 enum kobject_action action)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
			
 
				+	if (ret)
			
 
				+		pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
			
 
				+			action,
			
 
				+			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			
 
				+			&disk_to_dev(bdev->bd_disk)->kobj);
			
 
				+}
			
 
				+
			
 
				 void btrfs_cleanup_fs_uuids(void)
			
 
				 {
			
 
				 	struct btrfs_fs_devices *fs_devices;
			
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 
				 	return NULL;
			
 
				 }
			
 
				 
			
 
				+static int
			
 
				+btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
			
 
				+		      int flush, struct block_device **bdev,
			
 
				+		      struct buffer_head **bh)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	*bdev = blkdev_get_by_path(device_path, flags, holder);
			
 
				+
			
 
				+	if (IS_ERR(*bdev)) {
			
 
				+		ret = PTR_ERR(*bdev);
			
 
				+		printk(KERN_INFO "btrfs: open %s failed\n", device_path);
			
 
				+		goto error;
			
 
				+	}
			
 
				+
			
 
				+	if (flush)
			
 
				+		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
			
 
				+	ret = set_blocksize(*bdev, 4096);
			
 
				+	if (ret) {
			
 
				+		blkdev_put(*bdev, flags);
			
 
				+		goto error;
			
 
				+	}
			
 
				+	invalidate_bdev(*bdev);
			
 
				+	*bh = btrfs_read_dev_super(*bdev);
			
 
				+	if (!*bh) {
			
 
				+		ret = -EINVAL;
			
 
				+		blkdev_put(*bdev, flags);
			
 
				+		goto error;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+error:
			
 
				+	*bdev = NULL;
			
 
				+	*bh = NULL;
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 static void requeue_list(struct btrfs_pending_bios *pending_bios,
			
 
				 			struct bio *head, struct bio *tail)
			
 
				 {
			
@@ -467,7 +519,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 
				 	return ERR_PTR(-ENOMEM);
			
 
				 }
			
 
				 
			
 
				-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
			
 
				+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
			
 
				+			       struct btrfs_fs_devices *fs_devices, int step)
			
 
				 {
			
 
				 	struct btrfs_device *device, *next;
			
 
				 
			
@@ -480,8 +533,9 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 
				 	/* This is the initialized path, it is safe to release the devices. */
			
 
				 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
			
 
				 		if (device->in_fs_metadata) {
			
 
				-			if (!latest_transid ||
			
 
				-			    device->generation > latest_transid) {
			
 
				+			if (!device->is_tgtdev_for_dev_replace &&
			
 
				+			    (!latest_transid ||
			
 
				+			     device->generation > latest_transid)) {
			
 
				 				latest_devid = device->devid;
			
 
				 				latest_transid = device->generation;
			
 
				 				latest_bdev = device->bdev;
			
@@ -489,6 +543,21 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 
				 			continue;
			
 
				 		}
			
 
				 
			
 
				+		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			
 
				+			/*
			
 
				+			 * In the first step, keep the device which has
			
 
				+			 * the correct fsid and the devid that is used
			
 
				+			 * for the dev_replace procedure.
			
 
				+			 * In the second step, the dev_replace state is
			
 
				+			 * read from the device tree and it is known
			
 
				+			 * whether the procedure is really active or
			
 
				+			 * not, which means whether this device is
			
 
				+			 * used or whether it should be removed.
			
 
				+			 */
			
 
				+			if (step == 0 || device->is_tgtdev_for_dev_replace) {
			
 
				+				continue;
			
 
				+			}
			
 
				+		}
			
 
				 		if (device->bdev) {
			
 
				 			blkdev_put(device->bdev, device->mode);
			
 
				 			device->bdev = NULL;
			
@@ -497,7 +566,8 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 
				 		if (device->writeable) {
			
 
				 			list_del_init(&device->dev_alloc_list);
			
 
				 			device->writeable = 0;
			
 
				-			fs_devices->rw_devices--;
			
 
				+			if (!device->is_tgtdev_for_dev_replace)
			
 
				+				fs_devices->rw_devices--;
			
 
				 		}
			
 
				 		list_del_init(&device->dev_list);
			
 
				 		fs_devices->num_devices--;
			
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 
				 		if (device->bdev)
			
 
				 			fs_devices->open_devices--;
			
 
				 
			
 
				-		if (device->writeable) {
			
 
				+		if (device->writeable && !device->is_tgtdev_for_dev_replace) {
			
 
				 			list_del_init(&device->dev_alloc_list);
			
 
				 			fs_devices->rw_devices--;
			
 
				 		}
			
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
				 		if (!device->name)
			
 
				 			continue;
			
 
				 
			
 
				-		bdev = blkdev_get_by_path(device->name->str, flags, holder);
			
 
				-		if (IS_ERR(bdev)) {
			
 
				-			printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
			
 
				-			goto error;
			
 
				-		}
			
 
				-		filemap_write_and_wait(bdev->bd_inode->i_mapping);
			
 
				-		invalidate_bdev(bdev);
			
 
				-		set_blocksize(bdev, 4096);
			
 
				-
			
 
				-		bh = btrfs_read_dev_super(bdev);
			
 
				-		if (!bh)
			
 
				-			goto error_close;
			
 
				+		ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
			
 
				+					    &bdev, &bh);
			
 
				+		if (ret)
			
 
				+			continue;
			
 
				 
			
 
				 		disk_super = (struct btrfs_super_block *)bh->b_data;
			
 
				 		devid = btrfs_stack_device_id(&disk_super->dev_item);
			
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
				 			fs_devices->rotating = 1;
			
 
				 
			
 
				 		fs_devices->open_devices++;
			
 
				-		if (device->writeable) {
			
 
				+		if (device->writeable && !device->is_tgtdev_for_dev_replace) {
			
 
				 			fs_devices->rw_devices++;
			
 
				 			list_add(&device->dev_alloc_list,
			
 
				 				 &fs_devices->alloc_list);
			
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
				 
			
 
				 error_brelse:
			
 
				 		brelse(bh);
			
 
				-error_close:
			
 
				 		blkdev_put(bdev, flags);
			
 
				-error:
			
 
				 		continue;
			
 
				 	}
			
 
				 	if (fs_devices->open_devices == 0) {
			
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
				 	u64 total_devices;
			
 
				 
			
 
				 	flags |= FMODE_EXCL;
			
 
				-	bdev = blkdev_get_by_path(path, flags, holder);
			
 
				-
			
 
				-	if (IS_ERR(bdev)) {
			
 
				-		ret = PTR_ERR(bdev);
			
 
				-		goto error;
			
 
				-	}
			
 
				-
			
 
				 	mutex_lock(&uuid_mutex);
			
 
				-	ret = set_blocksize(bdev, 4096);
			
 
				+	ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
			
 
				 	if (ret)
			
 
				-		goto error_close;
			
 
				-	bh = btrfs_read_dev_super(bdev);
			
 
				-	if (!bh) {
			
 
				-		ret = -EINVAL;
			
 
				-		goto error_close;
			
 
				-	}
			
 
				+		goto error;
			
 
				 	disk_super = (struct btrfs_super_block *)bh->b_data;
			
 
				 	devid = btrfs_stack_device_id(&disk_super->dev_item);
			
 
				 	transid = btrfs_super_generation(disk_super);
			
 
				 	total_devices = btrfs_super_num_devices(disk_super);
			
 
				-	if (disk_super->label[0])
			
 
				+	if (disk_super->label[0]) {
			
 
				+		if (disk_super->label[BTRFS_LABEL_SIZE - 1])
			
 
				+			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
			
 
				 		printk(KERN_INFO "device label %s ", disk_super->label);
			
 
				-	else
			
 
				+	} else {
			
 
				 		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
			
 
				+	}
			
 
				 	printk(KERN_CONT "devid %llu transid %llu %s\n",
			
 
				 	       (unsigned long long)devid, (unsigned long long)transid, path);
			
 
				 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
			
 
				 	if (!ret && fs_devices_ret)
			
 
				 		(*fs_devices_ret)->total_devices = total_devices;
			
 
				 	brelse(bh);
			
 
				-error_close:
			
 
				-	mutex_unlock(&uuid_mutex);
			
 
				 	blkdev_put(bdev, flags);
			
 
				 error:
			
 
				+	mutex_unlock(&uuid_mutex);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 
				 
			
 
				 	*length = 0;
			
 
				 
			
 
				-	if (start >= device->total_bytes)
			
 
				+	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
			
 
				 		return 0;
			
 
				 
			
 
				 	path = btrfs_alloc_path();
			
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
 
				 	max_hole_size = 0;
			
 
				 	hole_size = 0;
			
 
				 
			
 
				-	if (search_start >= search_end) {
			
 
				+	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
			
 
				 		ret = -ENOSPC;
			
 
				 		goto error;
			
 
				 	}
			
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 
				 	struct btrfs_key key;
			
 
				 
			
 
				 	WARN_ON(!device->in_fs_metadata);
			
 
				+	WARN_ON(device->is_tgtdev_for_dev_replace);
			
 
				 	path = btrfs_alloc_path();
			
 
				 	if (!path)
			
 
				 		return -ENOMEM;
			
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 		root->fs_info->avail_system_alloc_bits |
			
 
				 		root->fs_info->avail_metadata_alloc_bits;
			
 
				 
			
 
				-	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
			
 
				-	    root->fs_info->fs_devices->num_devices <= 4) {
			
 
				+	num_devices = root->fs_info->fs_devices->num_devices;
			
 
				+	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
			
 
				+	if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
			
 
				+		WARN_ON(num_devices < 1);
			
 
				+		num_devices--;
			
 
				+	}
			
 
				+	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
			
 
				+
			
 
				+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
			
 
				 		printk(KERN_ERR "btrfs: unable to go below four devices "
			
 
				 		       "on raid10\n");
			
 
				 		ret = -EINVAL;
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
			
 
				-	    root->fs_info->fs_devices->num_devices <= 2) {
			
 
				+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
			
 
				 		printk(KERN_ERR "btrfs: unable to go below two "
			
 
				 		       "devices on raid1\n");
			
 
				 		ret = -EINVAL;
			
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 		 * is held.
			
 
				 		 */
			
 
				 		list_for_each_entry(tmp, devices, dev_list) {
			
 
				-			if (tmp->in_fs_metadata && !tmp->bdev) {
			
 
				+			if (tmp->in_fs_metadata &&
			
 
				+			    !tmp->is_tgtdev_for_dev_replace &&
			
 
				+			    !tmp->bdev) {
			
 
				 				device = tmp;
			
 
				 				break;
			
 
				 			}
			
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 			goto out;
			
 
				 		}
			
 
				 	} else {
			
 
				-		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
			
 
				-					  root->fs_info->bdev_holder);
			
 
				-		if (IS_ERR(bdev)) {
			
 
				-			ret = PTR_ERR(bdev);
			
 
				+		ret = btrfs_get_bdev_and_sb(device_path,
			
 
				+					    FMODE_READ | FMODE_EXCL,
			
 
				+					    root->fs_info->bdev_holder, 0,
			
 
				+					    &bdev, &bh);
			
 
				+		if (ret)
			
 
				 			goto out;
			
 
				-		}
			
 
				-
			
 
				-		set_blocksize(bdev, 4096);
			
 
				-		invalidate_bdev(bdev);
			
 
				-		bh = btrfs_read_dev_super(bdev);
			
 
				-		if (!bh) {
			
 
				-			ret = -EINVAL;
			
 
				-			goto error_close;
			
 
				-		}
			
 
				 		disk_super = (struct btrfs_super_block *)bh->b_data;
			
 
				 		devid = btrfs_stack_device_id(&disk_super->dev_item);
			
 
				 		dev_uuid = disk_super->dev_item.uuid;
			
 
				-		device = btrfs_find_device(root, devid, dev_uuid,
			
 
				+		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
			
 
				 					   disk_super->fsid);
			
 
				 		if (!device) {
			
 
				 			ret = -ENOENT;
			
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	if (device->is_tgtdev_for_dev_replace) {
			
 
				+		pr_err("btrfs: unable to remove the dev_replace target dev\n");
			
 
				+		ret = -EINVAL;
			
 
				+		goto error_brelse;
			
 
				+	}
			
 
				+
			
 
				 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
			
 
				 		printk(KERN_ERR "btrfs: unable to remove the only writeable "
			
 
				 		       "device\n");
			
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 	if (ret)
			
 
				 		goto error_undo;
			
 
				 
			
 
				+	/*
			
 
				+	 * TODO: the superblock still includes this device in its num_devices
			
 
				+	 * counter although write_all_supers() is not locked out. This
			
 
				+	 * could give a filesystem state which requires a degraded mount.
			
 
				+	 */
			
 
				 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
			
 
				 	if (ret)
			
 
				 		goto error_undo;
			
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 	spin_unlock(&root->fs_info->free_chunk_lock);
			
 
				 
			
 
				 	device->in_fs_metadata = 0;
			
 
				-	btrfs_scrub_cancel_dev(root, device);
			
 
				+	btrfs_scrub_cancel_dev(root->fs_info, device);
			
 
				 
			
 
				 	/*
			
 
				 	 * the device list mutex makes sure that we don't change
			
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 	 * at this point, the device is zero sized.  We want to
			
 
				 	 * remove it from the devices list and zero out the old super
			
 
				 	 */
			
 
				-	if (clear_super) {
			
 
				+	if (clear_super && disk_super) {
			
 
				 		/* make sure this device isn't detected as part of
			
 
				 		 * the FS anymore
			
 
				 		 */
			
@@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 
			
 
				 	ret = 0;
			
 
				 
			
 
				+	/* Notify udev that device has changed */
			
 
				+	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
			
 
				+
			
 
				 error_brelse:
			
 
				 	brelse(bh);
			
 
				-error_close:
			
 
				 	if (bdev)
			
 
				 		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
			
 
				 out:
			
@@ -1512,6 +1576,112 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 	goto error_brelse;
			
 
				 }
			
 
				 
			
 
				+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
			
 
				+				 struct btrfs_device *srcdev)
			
 
				+{
			
 
				+	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
			
 
				+	list_del_rcu(&srcdev->dev_list);
			
 
				+	list_del_rcu(&srcdev->dev_alloc_list);
			
 
				+	fs_info->fs_devices->num_devices--;
			
 
				+	if (srcdev->missing) {
			
 
				+		fs_info->fs_devices->missing_devices--;
			
 
				+		fs_info->fs_devices->rw_devices++;
			
 
				+	}
			
 
				+	if (srcdev->can_discard)
			
 
				+		fs_info->fs_devices->num_can_discard--;
			
 
				+	if (srcdev->bdev)
			
 
				+		fs_info->fs_devices->open_devices--;
			
 
				+
			
 
				+	call_rcu(&srcdev->rcu, free_device);
			
 
				+}
			
 
				+
			
 
				+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
			
 
				+				      struct btrfs_device *tgtdev)
			
 
				+{
			
 
				+	struct btrfs_device *next_device;
			
 
				+
			
 
				+	WARN_ON(!tgtdev);
			
 
				+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
			
 
				+	if (tgtdev->bdev) {
			
 
				+		btrfs_scratch_superblock(tgtdev);
			
 
				+		fs_info->fs_devices->open_devices--;
			
 
				+	}
			
 
				+	fs_info->fs_devices->num_devices--;
			
 
				+	if (tgtdev->can_discard)
			
 
				+		fs_info->fs_devices->num_can_discard++;
			
 
				+
			
 
				+	next_device = list_entry(fs_info->fs_devices->devices.next,
			
 
				+				 struct btrfs_device, dev_list);
			
 
				+	if (tgtdev->bdev == fs_info->sb->s_bdev)
			
 
				+		fs_info->sb->s_bdev = next_device->bdev;
			
 
				+	if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
			
 
				+		fs_info->fs_devices->latest_bdev = next_device->bdev;
			
 
				+	list_del_rcu(&tgtdev->dev_list);
			
 
				+
			
 
				+	call_rcu(&tgtdev->rcu, free_device);
			
 
				+
			
 
				+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			
 
				+}
			
 
				+
			
 
				+int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
			
 
				+			      struct btrfs_device **device)
			
 
				+{
			
 
				+	int ret = 0;
			
 
				+	struct btrfs_super_block *disk_super;
			
 
				+	u64 devid;
			
 
				+	u8 *dev_uuid;
			
 
				+	struct block_device *bdev;
			
 
				+	struct buffer_head *bh;
			
 
				+
			
 
				+	*device = NULL;
			
 
				+	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
			
 
				+				    root->fs_info->bdev_holder, 0, &bdev, &bh);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+	disk_super = (struct btrfs_super_block *)bh->b_data;
			
 
				+	devid = btrfs_stack_device_id(&disk_super->dev_item);
			
 
				+	dev_uuid = disk_super->dev_item.uuid;
			
 
				+	*device = btrfs_find_device(root->fs_info, devid, dev_uuid,
			
 
				+				    disk_super->fsid);
			
 
				+	brelse(bh);
			
 
				+	if (!*device)
			
 
				+		ret = -ENOENT;
			
 
				+	blkdev_put(bdev, FMODE_READ);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
			
 
				+					 char *device_path,
			
 
				+					 struct btrfs_device **device)
			
 
				+{
			
 
				+	*device = NULL;
			
 
				+	if (strcmp(device_path, "missing") == 0) {
			
 
				+		struct list_head *devices;
			
 
				+		struct btrfs_device *tmp;
			
 
				+
			
 
				+		devices = &root->fs_info->fs_devices->devices;
			
 
				+		/*
			
 
				+		 * It is safe to read the devices since the volume_mutex
			
 
				+		 * is held by the caller.
			
 
				+		 */
			
 
				+		list_for_each_entry(tmp, devices, dev_list) {
			
 
				+			if (tmp->in_fs_metadata && !tmp->bdev) {
			
 
				+				*device = tmp;
			
 
				+				break;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		if (!*device) {
			
 
				+			pr_err("btrfs: no missing device found\n");
			
 
				+			return -ENOENT;
			
 
				+		}
			
 
				+
			
 
				+		return 0;
			
 
				+	} else {
			
 
				+		return btrfs_find_device_by_path(root, device_path, device);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * does all the dirty work required for changing file system's UUID.
			
 
				  */
			
@@ -1630,7 +1800,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
 
				 		read_extent_buffer(leaf, fs_uuid,
			
 
				 				   (unsigned long)btrfs_device_fsid(dev_item),
			
 
				 				   BTRFS_UUID_SIZE);
			
 
				-		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
			
 
				+		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
			
 
				+					   fs_uuid);
			
 
				 		BUG_ON(!device); /* Logic error */
			
 
				 
			
 
				 		if (device->fs_devices->seeding) {
			
@@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
				 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
			
 
				 
			
 
				 	devices = &root->fs_info->fs_devices->devices;
			
 
				-	/*
			
 
				-	 * we have the volume lock, so we don't need the extra
			
 
				-	 * device list mutex while reading the list here.
			
 
				-	 */
			
 
				+
			
 
				+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				 	list_for_each_entry(device, devices, dev_list) {
			
 
				 		if (device->bdev == bdev) {
			
 
				 			ret = -EEXIST;
			
 
				+			mutex_unlock(
			
 
				+				&root->fs_info->fs_devices->device_list_mutex);
			
 
				 			goto error;
			
 
				 		}
			
 
				 	}
			
 
				+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				 
			
 
				 	device = kzalloc(sizeof(*device), GFP_NOFS);
			
 
				 	if (!device) {
			
@@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
				 	device->dev_root = root->fs_info->dev_root;
			
 
				 	device->bdev = bdev;
			
 
				 	device->in_fs_metadata = 1;
			
 
				+	device->is_tgtdev_for_dev_replace = 0;
			
 
				 	device->mode = FMODE_EXCL;
			
 
				 	set_blocksize(device->bdev, 4096);
			
 
				 
			
@@ -1844,6 +2017,98 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
			
 
				+				  struct btrfs_device **device_out)
			
 
				+{
			
 
				+	struct request_queue *q;
			
 
				+	struct btrfs_device *device;
			
 
				+	struct block_device *bdev;
			
 
				+	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				+	struct list_head *devices;
			
 
				+	struct rcu_string *name;
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	*device_out = NULL;
			
 
				+	if (fs_info->fs_devices->seeding)
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
			
 
				+				  fs_info->bdev_holder);
			
 
				+	if (IS_ERR(bdev))
			
 
				+		return PTR_ERR(bdev);
			
 
				+
			
 
				+	filemap_write_and_wait(bdev->bd_inode->i_mapping);
			
 
				+
			
 
				+	devices = &fs_info->fs_devices->devices;
			
 
				+	list_for_each_entry(device, devices, dev_list) {
			
 
				+		if (device->bdev == bdev) {
			
 
				+			ret = -EEXIST;
			
 
				+			goto error;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	device = kzalloc(sizeof(*device), GFP_NOFS);
			
 
				+	if (!device) {
			
 
				+		ret = -ENOMEM;
			
 
				+		goto error;
			
 
				+	}
			
 
				+
			
 
				+	name = rcu_string_strdup(device_path, GFP_NOFS);
			
 
				+	if (!name) {
			
 
				+		kfree(device);
			
 
				+		ret = -ENOMEM;
			
 
				+		goto error;
			
 
				+	}
			
 
				+	rcu_assign_pointer(device->name, name);
			
 
				+
			
 
				+	q = bdev_get_queue(bdev);
			
 
				+	if (blk_queue_discard(q))
			
 
				+		device->can_discard = 1;
			
 
				+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				+	device->writeable = 1;
			
 
				+	device->work.func = pending_bios_fn;
			
 
				+	generate_random_uuid(device->uuid);
			
 
				+	device->devid = BTRFS_DEV_REPLACE_DEVID;
			
 
				+	spin_lock_init(&device->io_lock);
			
 
				+	device->generation = 0;
			
 
				+	device->io_width = root->sectorsize;
			
 
				+	device->io_align = root->sectorsize;
			
 
				+	device->sector_size = root->sectorsize;
			
 
				+	device->total_bytes = i_size_read(bdev->bd_inode);
			
 
				+	device->disk_total_bytes = device->total_bytes;
			
 
				+	device->dev_root = fs_info->dev_root;
			
 
				+	device->bdev = bdev;
			
 
				+	device->in_fs_metadata = 1;
			
 
				+	device->is_tgtdev_for_dev_replace = 1;
			
 
				+	device->mode = FMODE_EXCL;
			
 
				+	set_blocksize(device->bdev, 4096);
			
 
				+	device->fs_devices = fs_info->fs_devices;
			
 
				+	list_add(&device->dev_list, &fs_info->fs_devices->devices);
			
 
				+	fs_info->fs_devices->num_devices++;
			
 
				+	fs_info->fs_devices->open_devices++;
			
 
				+	if (device->can_discard)
			
 
				+		fs_info->fs_devices->num_can_discard++;
			
 
				+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				+
			
 
				+	*device_out = device;
			
 
				+	return ret;
			
 
				+
			
 
				+error:
			
 
				+	blkdev_put(bdev, FMODE_EXCL);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
			
 
				+					      struct btrfs_device *tgtdev)
			
 
				+{
			
 
				+	WARN_ON(fs_info->fs_devices->rw_devices == 0);
			
 
				+	tgtdev->io_width = fs_info->dev_root->sectorsize;
			
 
				+	tgtdev->io_align = fs_info->dev_root->sectorsize;
			
 
				+	tgtdev->sector_size = fs_info->dev_root->sectorsize;
			
 
				+	tgtdev->dev_root = fs_info->dev_root;
			
 
				+	tgtdev->in_fs_metadata = 1;
			
 
				+}
			
 
				+
			
 
				 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
			
 
				 					struct btrfs_device *device)
			
 
				 {
			
@@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 
				 
			
 
				 	if (!device->writeable)
			
 
				 		return -EACCES;
			
 
				-	if (new_size <= device->total_bytes)
			
 
				+	if (new_size <= device->total_bytes ||
			
 
				+	    device->is_tgtdev_for_dev_replace)
			
 
				 		return -EINVAL;
			
 
				 
			
 
				 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
			
@@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				-static u64 div_factor_fine(u64 num, int factor)
			
 
				-{
			
 
				-	if (factor <= 0)
			
 
				-		return 0;
			
 
				-	if (factor >= 100)
			
 
				-		return num;
			
 
				-
			
 
				-	num *= factor;
			
 
				-	do_div(num, 100);
			
 
				-	return num;
			
 
				-}
			
 
				-
			
 
				 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			
 
				 			      struct btrfs_balance_args *bargs)
			
 
				 {
			
@@ -2514,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				-static u64 div_factor(u64 num, int factor)
			
 
				-{
			
 
				-	if (factor == 10)
			
 
				-		return num;
			
 
				-	num *= factor;
			
 
				-	do_div(num, 10);
			
 
				-	return num;
			
 
				-}
			
 
				-
			
 
				 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
			
 
				 {
			
 
				 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
			
@@ -2550,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 
				 		size_to_free = div_factor(old_size, 1);
			
 
				 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
			
 
				 		if (!device->writeable ||
			
 
				-		    device->total_bytes - device->bytes_used > size_to_free)
			
 
				+		    device->total_bytes - device->bytes_used > size_to_free ||
			
 
				+		    device->is_tgtdev_for_dev_replace)
			
 
				 			continue;
			
 
				 
			
 
				 		ret = btrfs_shrink_device(device, old_size - size_to_free);
			
@@ -2728,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
				 	u64 allowed;
			
 
				 	int mixed = 0;
			
 
				 	int ret;
			
 
				+	u64 num_devices;
			
 
				 
			
 
				 	if (btrfs_fs_closing(fs_info) ||
			
 
				 	    atomic_read(&fs_info->balance_pause_req) ||
			
@@ -2756,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	num_devices = fs_info->fs_devices->num_devices;
			
 
				+	btrfs_dev_replace_lock(&fs_info->dev_replace);
			
 
				+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
			
 
				+		BUG_ON(num_devices < 1);
			
 
				+		num_devices--;
			
 
				+	}
			
 
				+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				 	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
			
 
				-	if (fs_info->fs_devices->num_devices == 1)
			
 
				+	if (num_devices == 1)
			
 
				 		allowed |= BTRFS_BLOCK_GROUP_DUP;
			
 
				-	else if (fs_info->fs_devices->num_devices < 4)
			
 
				+	else if (num_devices < 4)
			
 
				 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
			
 
				 	else
			
 
				 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
			
@@ -2902,6 +3156,7 @@ static int balance_kthread(void *data)
 
				 		ret = btrfs_balance(fs_info->balance_ctl, NULL);
			
 
				 	}
			
 
				 
			
 
				+	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
			
 
				 	mutex_unlock(&fs_info->balance_mutex);
			
 
				 	mutex_unlock(&fs_info->volume_mutex);
			
 
				 
			
@@ -2924,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 
				 		return 0;
			
 
				 	}
			
 
				 
			
 
				+	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
			
 
				 	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
			
 
				 	if (IS_ERR(tsk))
			
 
				 		return PTR_ERR(tsk);
			
@@ -3080,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 
				 	u64 old_size = device->total_bytes;
			
 
				 	u64 diff = device->total_bytes - new_size;
			
 
				 
			
 
				-	if (new_size >= device->total_bytes)
			
 
				+	if (device->is_tgtdev_for_dev_replace)
			
 
				 		return -EINVAL;
			
 
				 
			
 
				 	path = btrfs_alloc_path();
			
@@ -3235,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
			
 
				+	{ 2, 1, 0, 4, 2, 2 /* raid10 */ },
			
 
				+	{ 1, 1, 2, 2, 2, 2 /* raid1 */ },
			
 
				+	{ 1, 2, 1, 1, 1, 2 /* dup */ },
			
 
				+	{ 1, 1, 0, 2, 1, 1 /* raid0 */ },
			
 
				+	{ 1, 1, 0, 1, 1, 1 /* single */ },
			
 
				+};
			
 
				+
			
 
				 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
			
 
				 			       struct btrfs_root *extent_root,
			
 
				 			       struct map_lookup **map_ret,
			
@@ -3264,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
				 	int ndevs;
			
 
				 	int i;
			
 
				 	int j;
			
 
				+	int index;
			
 
				 
			
 
				 	BUG_ON(!alloc_profile_is_valid(type, 0));
			
 
				 
			
 
				 	if (list_empty(&fs_devices->alloc_list))
			
 
				 		return -ENOSPC;
			
 
				 
			
 
				-	sub_stripes = 1;
			
 
				-	dev_stripes = 1;
			
 
				-	devs_increment = 1;
			
 
				-	ncopies = 1;
			
 
				-	devs_max = 0;	/* 0 == as many as possible */
			
 
				-	devs_min = 1;
			
 
				+	index = __get_raid_index(type);
			
 
				 
			
 
				-	/*
			
 
				-	 * define the properties of each RAID type.
			
 
				-	 * FIXME: move this to a global table and use it in all RAID
			
 
				-	 * calculation code
			
 
				-	 */
			
 
				-	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
			
 
				-		dev_stripes = 2;
			
 
				-		ncopies = 2;
			
 
				-		devs_max = 1;
			
 
				-	} else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
			
 
				-		devs_min = 2;
			
 
				-	} else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
			
 
				-		devs_increment = 2;
			
 
				-		ncopies = 2;
			
 
				-		devs_max = 2;
			
 
				-		devs_min = 2;
			
 
				-	} else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
			
 
				-		sub_stripes = 2;
			
 
				-		devs_increment = 2;
			
 
				-		ncopies = 2;
			
 
				-		devs_min = 4;
			
 
				-	} else {
			
 
				-		devs_max = 1;
			
 
				-	}
			
 
				+	sub_stripes = btrfs_raid_array[index].sub_stripes;
			
 
				+	dev_stripes = btrfs_raid_array[index].dev_stripes;
			
 
				+	devs_max = btrfs_raid_array[index].devs_max;
			
 
				+	devs_min = btrfs_raid_array[index].devs_min;
			
 
				+	devs_increment = btrfs_raid_array[index].devs_increment;
			
 
				+	ncopies = btrfs_raid_array[index].ncopies;
			
 
				 
			
 
				 	if (type & BTRFS_BLOCK_GROUP_DATA) {
			
 
				 		max_stripe_size = 1024 * 1024 * 1024;
			
@@ -3347,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
				 		cur = cur->next;
			
 
				 
			
 
				 		if (!device->writeable) {
			
 
				-			printk(KERN_ERR
			
 
				+			WARN(1, KERN_ERR
			
 
				 			       "btrfs: read-only device in alloc_list\n");
			
 
				-			WARN_ON(1);
			
 
				 			continue;
			
 
				 		}
			
 
				 
			
 
				-		if (!device->in_fs_metadata)
			
 
				+		if (!device->in_fs_metadata ||
			
 
				+		    device->is_tgtdev_for_dev_replace)
			
 
				 			continue;
			
 
				 
			
 
				 		if (device->total_bytes > device->bytes_used)
			
@@ -3382,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
				 		devices_info[ndevs].total_avail = total_avail;
			
 
				 		devices_info[ndevs].dev = device;
			
 
				 		++ndevs;
			
 
				+		WARN_ON(ndevs > fs_devices->rw_devices);
			
 
				 	}
			
 
				 
			
 
				 	/*
			
@@ -3740,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
			
 
				+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
			
 
				 {
			
 
				+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
			
 
				 	struct extent_map *em;
			
 
				 	struct map_lookup *map;
			
 
				 	struct extent_map_tree *em_tree = &map_tree->map_tree;
			
@@ -3761,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 
				 	else
			
 
				 		ret = 1;
			
 
				 	free_extent_map(em);
			
 
				+
			
 
				+	btrfs_dev_replace_lock(&fs_info->dev_replace);
			
 
				+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
			
 
				+		ret++;
			
 
				+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int find_live_mirror(struct map_lookup *map, int first, int num,
			
 
				-			    int optimal)
			
 
				+static int find_live_mirror(struct btrfs_fs_info *fs_info,
			
 
				+			    struct map_lookup *map, int first, int num,
			
 
				+			    int optimal, int dev_replace_is_ongoing)
			
 
				 {
			
 
				 	int i;
			
 
				-	if (map->stripes[optimal].dev->bdev)
			
 
				-		return optimal;
			
 
				-	for (i = first; i < first + num; i++) {
			
 
				-		if (map->stripes[i].dev->bdev)
			
 
				-			return i;
			
 
				+	int tolerance;
			
 
				+	struct btrfs_device *srcdev;
			
 
				+
			
 
				+	if (dev_replace_is_ongoing &&
			
 
				+	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
			
 
				+	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
			
 
				+		srcdev = fs_info->dev_replace.srcdev;
			
 
				+	else
			
 
				+		srcdev = NULL;
			
 
				+
			
 
				+	/*
			
 
				+	 * try to avoid the drive that is the source drive for a
			
 
				+	 * dev-replace procedure, only choose it if no other non-missing
			
 
				+	 * mirror is available
			
 
				+	 */
			
 
				+	for (tolerance = 0; tolerance < 2; tolerance++) {
			
 
				+		if (map->stripes[optimal].dev->bdev &&
			
 
				+		    (tolerance || map->stripes[optimal].dev != srcdev))
			
 
				+			return optimal;
			
 
				+		for (i = first; i < first + num; i++) {
			
 
				+			if (map->stripes[i].dev->bdev &&
			
 
				+			    (tolerance || map->stripes[i].dev != srcdev))
			
 
				+				return i;
			
 
				+		}
			
 
				 	}
			
 
				+
			
 
				 	/* we couldn't find one that doesn't fail.  Just return something
			
 
				 	 * and the io error handling code will clean up eventually
			
 
				 	 */
			
 
				 	return optimal;
			
 
				 }
			
 
				 
			
 
				-static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
			
 
				+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
			
 
				 			     u64 logical, u64 *length,
			
 
				 			     struct btrfs_bio **bbio_ret,
			
 
				 			     int mirror_num)
			
 
				 {
			
 
				 	struct extent_map *em;
			
 
				 	struct map_lookup *map;
			
 
				+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
			
 
				 	struct extent_map_tree *em_tree = &map_tree->map_tree;
			
 
				 	u64 offset;
			
 
				 	u64 stripe_offset;
			
@@ -3800,6 +4072,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 
				 	int num_stripes;
			
 
				 	int max_errors = 0;
			
 
				 	struct btrfs_bio *bbio = NULL;
			
 
				+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				+	int dev_replace_is_ongoing = 0;
			
 
				+	int num_alloc_stripes;
			
 
				+	int patch_the_first_stripe_for_dev_replace = 0;
			
 
				+	u64 physical_to_patch_in_first_stripe = 0;
			
 
				 
			
 
				 	read_lock(&em_tree->lock);
			
 
				 	em = lookup_extent_mapping(em_tree, logical, *length);
			
@@ -3816,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 
				 	map = (struct map_lookup *)em->bdev;
			
 
				 	offset = logical - em->start;
			
 
				 
			
 
				-	if (mirror_num > map->num_stripes)
			
 
				-		mirror_num = 0;
			
 
				-
			
 
				 	stripe_nr = offset;
			
 
				 	/*
			
 
				 	 * stripe_nr counts the total number of stripes we have to stride
			
@@ -3845,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 
				 	if (!bbio_ret)
			
 
				 		goto out;
			
 
				 
			
 
				+	btrfs_dev_replace_lock(dev_replace);
			
 
				+	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
			
 
				+	if (!dev_replace_is_ongoing)
			
 
				+		btrfs_dev_replace_unlock(dev_replace);
			
 
				+
			
 
				+	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
			
 
				+	    !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
			
 
				+	    dev_replace->tgtdev != NULL) {
			
 
				+		/*
			
 
				+		 * in dev-replace case, for repair case (that's the only
			
 
				+		 * case where the mirror is selected explicitly when
			
 
				+		 * calling btrfs_map_block), blocks left of the left cursor
			
 
				+		 * can also be read from the target drive.
			
 
				+		 * For REQ_GET_READ_MIRRORS, the target drive is added as
			
 
				+		 * the last one to the array of stripes. For READ, it also
			
 
				+		 * needs to be supported using the same mirror number.
			
 
				+		 * If the requested block is not left of the left cursor,
			
 
				+		 * EIO is returned. This can happen because btrfs_num_copies()
			
 
				+		 * returns one more in the dev-replace case.
			
 
				+		 */
			
 
				+		u64 tmp_length = *length;
			
 
				+		struct btrfs_bio *tmp_bbio = NULL;
			
 
				+		int tmp_num_stripes;
			
 
				+		u64 srcdev_devid = dev_replace->srcdev->devid;
			
 
				+		int index_srcdev = 0;
			
 
				+		int found = 0;
			
 
				+		u64 physical_of_found = 0;
			
 
				+
			
 
				+		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
			
 
				+			     logical, &tmp_length, &tmp_bbio, 0);
			
 
				+		if (ret) {
			
 
				+			WARN_ON(tmp_bbio != NULL);
			
 
				+			goto out;
			
 
				+		}
			
 
				+
			
 
				+		tmp_num_stripes = tmp_bbio->num_stripes;
			
 
				+		if (mirror_num > tmp_num_stripes) {
			
 
				+			/*
			
 
				+			 * REQ_GET_READ_MIRRORS does not contain this
			
 
				+			 * mirror, that means that the requested area
			
 
				+			 * is not left of the left cursor
			
 
				+			 */
			
 
				+			ret = -EIO;
			
 
				+			kfree(tmp_bbio);
			
 
				+			goto out;
			
 
				+		}
			
 
				+
			
 
				+		/*
			
 
				+		 * process the rest of the function using the mirror_num
			
 
				+		 * of the source drive. Therefore look it up first.
			
 
				+		 * At the end, patch the device pointer to the one of the
			
 
				+		 * target drive.
			
 
				+		 */
			
 
				+		for (i = 0; i < tmp_num_stripes; i++) {
			
 
				+			if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
			
 
				+				/*
			
 
				+				 * In case of DUP, in order to keep it
			
 
				+				 * simple, only add the mirror with the
			
 
				+				 * lowest physical address
			
 
				+				 */
			
 
				+				if (found &&
			
 
				+				    physical_of_found <=
			
 
				+				     tmp_bbio->stripes[i].physical)
			
 
				+					continue;
			
 
				+				index_srcdev = i;
			
 
				+				found = 1;
			
 
				+				physical_of_found =
			
 
				+					tmp_bbio->stripes[i].physical;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		if (found) {
			
 
				+			mirror_num = index_srcdev + 1;
			
 
				+			patch_the_first_stripe_for_dev_replace = 1;
			
 
				+			physical_to_patch_in_first_stripe = physical_of_found;
			
 
				+		} else {
			
 
				+			WARN_ON(1);
			
 
				+			ret = -EIO;
			
 
				+			kfree(tmp_bbio);
			
 
				+			goto out;
			
 
				+		}
			
 
				+
			
 
				+		kfree(tmp_bbio);
			
 
				+	} else if (mirror_num > map->num_stripes) {
			
 
				+		mirror_num = 0;
			
 
				+	}
			
 
				+
			
 
				 	num_stripes = 1;
			
 
				 	stripe_index = 0;
			
 
				 	stripe_nr_orig = stripe_nr;
			
@@ -3859,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 
				 					    stripe_nr_end - stripe_nr_orig);
			
 
				 		stripe_index = do_div(stripe_nr, map->num_stripes);
			
 
				 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
			
 
				-		if (rw & (REQ_WRITE | REQ_DISCARD))
			
 
				+		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
			
 
				 			num_stripes = map->num_stripes;
			
 
				 		else if (mirror_num)
			
 
				 			stripe_index = mirror_num - 1;
			
 
				 		else {
			
 
				-			stripe_index = find_live_mirror(map, 0,
			
 
				+			stripe_index = find_live_mirror(fs_info, map, 0,
			
 
				 					    map->num_stripes,
			
 
				-					    current->pid % map->num_stripes);
			
 
				+					    current->pid % map->num_stripes,
			
 
				+					    dev_replace_is_ongoing);
			
 
				 			mirror_num = stripe_index + 1;
			
 
				 		}
			
 
				 
			
 
				 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
			
 
				-		if (rw & (REQ_WRITE | REQ_DISCARD)) {
			
 
				+		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
			
 
				 			num_stripes = map->num_stripes;
			
 
				 		} else if (mirror_num) {
			
 
				 			stripe_index = mirror_num - 1;
			
@@ -3885,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 
				 		stripe_index = do_div(stripe_nr, factor);
			
 
				 		stripe_index *= map->sub_stripes;
			
 
				 
			
 
				-		if (rw & REQ_WRITE)
			
 
				+		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
			
 
				 			num_stripes = map->sub_stripes;
			
 
				 		else if (rw & REQ_DISCARD)
			
 
				 			num_stripes = min_t(u64, map->sub_stripes *
			
@@ -3895,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 
				 			stripe_index += mirror_num - 1;
			
 
				 		else {
			
 
				 			int old_stripe_index = stripe_index;
			
 
				-			stripe_index = find_live_mirror(map, stripe_index,
			
 
				+			stripe_index = find_live_mirror(fs_info, map,
			
 
				+					      stripe_index,
			
 
				 					      map->sub_stripes, stripe_index +
			
 
				-					      current->pid % map->sub_stripes);
			
 
				+					      current->pid % map->sub_stripes,
			
 
				+					      dev_replace_is_ongoing);
			
 
				 			mirror_num = stripe_index - old_stripe_index + 1;
			
 
				 		}
			
 
				 	} else {
			
@@ -3911,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 
				 	}
			
 
				 	BUG_ON(stripe_index >= map->num_stripes);
			
 
				 
			
 
				-	bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
			
 
				+	num_alloc_stripes = num_stripes;
			
 
				+	if (dev_replace_is_ongoing) {
			
 
				+		if (rw & (REQ_WRITE | REQ_DISCARD))
			
 
				+			num_alloc_stripes <<= 1;
			
 
				+		if (rw & REQ_GET_READ_MIRRORS)
			
 
				+			num_alloc_stripes++;
			
 
				+	}
			
 
				+	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
			
 
				 	if (!bbio) {
			
 
				 		ret = -ENOMEM;
			
 
				 		goto out;
			
@@ -3998,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	if (rw & REQ_WRITE) {
			
 
				+	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
			
 
				 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
			
 
				 				 BTRFS_BLOCK_GROUP_RAID10 |
			
 
				 				 BTRFS_BLOCK_GROUP_DUP)) {
			
@@ -4006,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
			
 
				+	    dev_replace->tgtdev != NULL) {
			
 
				+		int index_where_to_add;
			
 
				+		u64 srcdev_devid = dev_replace->srcdev->devid;
			
 
				+
			
 
				+		/*
			
 
				+		 * duplicate the write operations while the dev replace
			
 
				+		 * procedure is running. Since the copying of the old disk
			
 
				+		 * to the new disk takes place at run time while the
			
 
				+		 * filesystem is mounted writable, the regular write
			
 
				+		 * operations to the old disk have to be duplicated to go
			
 
				+		 * to the new disk as well.
			
 
				+		 * Note that device->missing is handled by the caller, and
			
 
				+		 * that the write to the old disk is already set up in the
			
 
				+		 * stripes array.
			
 
				+		 */
			
 
				+		index_where_to_add = num_stripes;
			
 
				+		for (i = 0; i < num_stripes; i++) {
			
 
				+			if (bbio->stripes[i].dev->devid == srcdev_devid) {
			
 
				+				/* write to new disk, too */
			
 
				+				struct btrfs_bio_stripe *new =
			
 
				+					bbio->stripes + index_where_to_add;
			
 
				+				struct btrfs_bio_stripe *old =
			
 
				+					bbio->stripes + i;
			
 
				+
			
 
				+				new->physical = old->physical;
			
 
				+				new->length = old->length;
			
 
				+				new->dev = dev_replace->tgtdev;
			
 
				+				index_where_to_add++;
			
 
				+				max_errors++;
			
 
				+			}
			
 
				+		}
			
 
				+		num_stripes = index_where_to_add;
			
 
				+	} else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
			
 
				+		   dev_replace->tgtdev != NULL) {
			
 
				+		u64 srcdev_devid = dev_replace->srcdev->devid;
			
 
				+		int index_srcdev = 0;
			
 
				+		int found = 0;
			
 
				+		u64 physical_of_found = 0;
			
 
				+
			
 
				+		/*
			
 
				+		 * During the dev-replace procedure, the target drive can
			
 
				+		 * also be used to read data in case it is needed to repair
			
 
				+		 * a corrupt block elsewhere. This is possible if the
			
 
				+		 * requested area is left of the left cursor. In this area,
			
 
				+		 * the target drive is a full copy of the source drive.
			
 
				+		 */
			
 
				+		for (i = 0; i < num_stripes; i++) {
			
 
				+			if (bbio->stripes[i].dev->devid == srcdev_devid) {
			
 
				+				/*
			
 
				+				 * In case of DUP, in order to keep it
			
 
				+				 * simple, only add the mirror with the
			
 
				+				 * lowest physical address
			
 
				+				 */
			
 
				+				if (found &&
			
 
				+				    physical_of_found <=
			
 
				+				     bbio->stripes[i].physical)
			
 
				+					continue;
			
 
				+				index_srcdev = i;
			
 
				+				found = 1;
			
 
				+				physical_of_found = bbio->stripes[i].physical;
			
 
				+			}
			
 
				+		}
			
 
				+		if (found) {
			
 
				+			u64 length = map->stripe_len;
			
 
				+
			
 
				+			if (physical_of_found + length <=
			
 
				+			    dev_replace->cursor_left) {
			
 
				+				struct btrfs_bio_stripe *tgtdev_stripe =
			
 
				+					bbio->stripes + num_stripes;
			
 
				+
			
 
				+				tgtdev_stripe->physical = physical_of_found;
			
 
				+				tgtdev_stripe->length =
			
 
				+					bbio->stripes[index_srcdev].length;
			
 
				+				tgtdev_stripe->dev = dev_replace->tgtdev;
			
 
				+
			
 
				+				num_stripes++;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	*bbio_ret = bbio;
			
 
				 	bbio->num_stripes = num_stripes;
			
 
				 	bbio->max_errors = max_errors;
			
 
				 	bbio->mirror_num = mirror_num;
			
 
				+
			
 
				+	/*
			
 
				+	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
			
 
				+	 * mirror_num == num_stripes + 1 && dev_replace target drive is
			
 
				+	 * available as a mirror
			
 
				+	 */
			
 
				+	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
			
 
				+		WARN_ON(num_stripes > 1);
			
 
				+		bbio->stripes[0].dev = dev_replace->tgtdev;
			
 
				+		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
			
 
				+		bbio->mirror_num = map->num_stripes + 1;
			
 
				+	}
			
 
				 out:
			
 
				+	if (dev_replace_is_ongoing)
			
 
				+		btrfs_dev_replace_unlock(dev_replace);
			
 
				 	free_extent_map(em);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
			
 
				+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
			
 
				 		      u64 logical, u64 *length,
			
 
				 		      struct btrfs_bio **bbio_ret, int mirror_num)
			
 
				 {
			
 
				-	return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
			
 
				+	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
			
 
				 				 mirror_num);
			
 
				 }
			
 
				 
			
@@ -4238,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
 
				 				   &device->work);
			
 
				 }
			
 
				 
			
 
				+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
			
 
				+		       sector_t sector)
			
 
				+{
			
 
				+	struct bio_vec *prev;
			
 
				+	struct request_queue *q = bdev_get_queue(bdev);
			
 
				+	unsigned short max_sectors = queue_max_sectors(q);
			
 
				+	struct bvec_merge_data bvm = {
			
 
				+		.bi_bdev = bdev,
			
 
				+		.bi_sector = sector,
			
 
				+		.bi_rw = bio->bi_rw,
			
 
				+	};
			
 
				+
			
 
				+	if (bio->bi_vcnt == 0) {
			
 
				+		WARN_ON(1);
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
			
 
				+	if ((bio->bi_size >> 9) > max_sectors)
			
 
				+		return 0;
			
 
				+
			
 
				+	if (!q->merge_bvec_fn)
			
 
				+		return 1;
			
 
				+
			
 
				+	bvm.bi_size = bio->bi_size - prev->bv_len;
			
 
				+	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
			
 
				+		return 0;
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
			
 
				+			      struct bio *bio, u64 physical, int dev_nr,
			
 
				+			      int rw, int async)
			
 
				+{
			
 
				+	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
			
 
				+
			
 
				+	bio->bi_private = bbio;
			
 
				+	bio->bi_private = merge_stripe_index_into_bio_private(
			
 
				+			bio->bi_private, (unsigned int)dev_nr);
			
 
				+	bio->bi_end_io = btrfs_end_bio;
			
 
				+	bio->bi_sector = physical >> 9;
			
 
				+#ifdef DEBUG
			
 
				+	{
			
 
				+		struct rcu_string *name;
			
 
				+
			
 
				+		rcu_read_lock();
			
 
				+		name = rcu_dereference(dev->name);
			
 
				+		pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
			
 
				+			 "(%s id %llu), size=%u\n", rw,
			
 
				+			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
			
 
				+			 name->str, dev->devid, bio->bi_size);
			
 
				+		rcu_read_unlock();
			
 
				+	}
			
 
				+#endif
			
 
				+	bio->bi_bdev = dev->bdev;
			
 
				+	if (async)
			
 
				+		schedule_bio(root, dev, rw, bio);
			
 
				+	else
			
 
				+		btrfsic_submit_bio(rw, bio);
			
 
				+}
			
 
				+
			
 
				+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
			
 
				+			      struct bio *first_bio, struct btrfs_device *dev,
			
 
				+			      int dev_nr, int rw, int async)
			
 
				+{
			
 
				+	struct bio_vec *bvec = first_bio->bi_io_vec;
			
 
				+	struct bio *bio;
			
 
				+	int nr_vecs = bio_get_nr_vecs(dev->bdev);
			
 
				+	u64 physical = bbio->stripes[dev_nr].physical;
			
 
				+
			
 
				+again:
			
 
				+	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
			
 
				+	if (!bio)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
			
 
				+		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
			
 
				+				 bvec->bv_offset) < bvec->bv_len) {
			
 
				+			u64 len = bio->bi_size;
			
 
				+
			
 
				+			atomic_inc(&bbio->stripes_pending);
			
 
				+			submit_stripe_bio(root, bbio, bio, physical, dev_nr,
			
 
				+					  rw, async);
			
 
				+			physical += len;
			
 
				+			goto again;
			
 
				+		}
			
 
				+		bvec++;
			
 
				+	}
			
 
				+
			
 
				+	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
			
 
				+{
			
 
				+	atomic_inc(&bbio->error);
			
 
				+	if (atomic_dec_and_test(&bbio->stripes_pending)) {
			
 
				+		bio->bi_private = bbio->private;
			
 
				+		bio->bi_end_io = bbio->end_io;
			
 
				+		bio->bi_bdev = (struct block_device *)
			
 
				+			(unsigned long)bbio->mirror_num;
			
 
				+		bio->bi_sector = logical >> 9;
			
 
				+		kfree(bbio);
			
 
				+		bio_endio(bio, -EIO);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
			
 
				 		  int mirror_num, int async_submit)
			
 
				 {
			
 
				-	struct btrfs_mapping_tree *map_tree;
			
 
				 	struct btrfs_device *dev;
			
 
				 	struct bio *first_bio = bio;
			
 
				 	u64 logical = (u64)bio->bi_sector << 9;
			
@@ -4253,12 +4825,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
				 	struct btrfs_bio *bbio = NULL;
			
 
				 
			
 
				 	length = bio->bi_size;
			
 
				-	map_tree = &root->fs_info->mapping_tree;
			
 
				 	map_length = length;
			
 
				 
			
 
				-	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
			
 
				+	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
			
 
				 			      mirror_num);
			
 
				-	if (ret) /* -ENOMEM */
			
 
				+	if (ret)
			
 
				 		return ret;
			
 
				 
			
 
				 	total_devs = bbio->num_stripes;
			
@@ -4276,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
				 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
			
 
				 
			
 
				 	while (dev_nr < total_devs) {
			
 
				+		dev = bbio->stripes[dev_nr].dev;
			
 
				+		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
			
 
				+			bbio_error(bbio, first_bio, logical);
			
 
				+			dev_nr++;
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		/*
			
 
				+		 * Check and see if we're ok with this bio based on its size
			
 
				+		 * and offset with the given device.
			
 
				+		 */
			
 
				+		if (!bio_size_ok(dev->bdev, first_bio,
			
 
				+				 bbio->stripes[dev_nr].physical >> 9)) {
			
 
				+			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
			
 
				+						 dev_nr, rw, async_submit);
			
 
				+			BUG_ON(ret);
			
 
				+			dev_nr++;
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				 		if (dev_nr < total_devs - 1) {
			
 
				 			bio = bio_clone(first_bio, GFP_NOFS);
			
 
				 			BUG_ON(!bio); /* -ENOMEM */
			
 
				 		} else {
			
 
				 			bio = first_bio;
			
 
				 		}
			
 
				-		bio->bi_private = bbio;
			
 
				-		bio->bi_private = merge_stripe_index_into_bio_private(
			
 
				-				bio->bi_private, (unsigned int)dev_nr);
			
 
				-		bio->bi_end_io = btrfs_end_bio;
			
 
				-		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
			
 
				-		dev = bbio->stripes[dev_nr].dev;
			
 
				-		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
			
 
				-#ifdef DEBUG
			
 
				-			struct rcu_string *name;
			
 
				-
			
 
				-			rcu_read_lock();
			
 
				-			name = rcu_dereference(dev->name);
			
 
				-			pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
			
 
				-				 "(%s id %llu), size=%u\n", rw,
			
 
				-				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
			
 
				-				 name->str, dev->devid, bio->bi_size);
			
 
				-			rcu_read_unlock();
			
 
				-#endif
			
 
				-			bio->bi_bdev = dev->bdev;
			
 
				-			if (async_submit)
			
 
				-				schedule_bio(root, dev, rw, bio);
			
 
				-			else
			
 
				-				btrfsic_submit_bio(rw, bio);
			
 
				-		} else {
			
 
				-			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
			
 
				-			bio->bi_sector = logical >> 9;
			
 
				-			bio_endio(bio, -EIO);
			
 
				-		}
			
 
				+
			
 
				+		submit_stripe_bio(root, bbio, bio,
			
 
				+				  bbio->stripes[dev_nr].physical, dev_nr, rw,
			
 
				+				  async_submit);
			
 
				 		dev_nr++;
			
 
				 	}
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
			
 
				+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
			
 
				 				       u8 *uuid, u8 *fsid)
			
 
				 {
			
 
				 	struct btrfs_device *device;
			
 
				 	struct btrfs_fs_devices *cur_devices;
			
 
				 
			
 
				-	cur_devices = root->fs_info->fs_devices;
			
 
				+	cur_devices = fs_info->fs_devices;
			
 
				 	while (cur_devices) {
			
 
				 		if (!fsid ||
			
 
				 		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
			
@@ -4402,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
				 	em->bdev = (struct block_device *)map;
			
 
				 	em->start = logical;
			
 
				 	em->len = length;
			
 
				+	em->orig_start = 0;
			
 
				 	em->block_start = 0;
			
 
				 	em->block_len = em->len;
			
 
				 
			
@@ -4419,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
				 		read_extent_buffer(leaf, uuid, (unsigned long)
			
 
				 				   btrfs_stripe_dev_uuid_nr(chunk, i),
			
 
				 				   BTRFS_UUID_SIZE);
			
 
				-		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
			
 
				-							NULL);
			
 
				+		map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
			
 
				+							uuid, NULL);
			
 
				 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
			
 
				 			kfree(map);
			
 
				 			free_extent_map(em);
			
@@ -4461,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
 
				 	device->io_align = btrfs_device_io_align(leaf, dev_item);
			
 
				 	device->io_width = btrfs_device_io_width(leaf, dev_item);
			
 
				 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
			
 
				+	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
			
 
				+	device->is_tgtdev_for_dev_replace = 0;
			
 
				 
			
 
				 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
			
 
				 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
			
@@ -4538,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,
 
				 			return ret;
			
 
				 	}
			
 
				 
			
 
				-	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
			
 
				+	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
			
 
				 	if (!device || !device->bdev) {
			
 
				 		if (!btrfs_test_opt(root, DEGRADED))
			
 
				 			return -EIO;
			
@@ -4571,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,
 
				 	fill_device_from_item(leaf, dev_item, device);
			
 
				 	device->dev_root = root->fs_info->dev_root;
			
 
				 	device->in_fs_metadata = 1;
			
 
				-	if (device->writeable) {
			
 
				+	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
			
 
				 		device->fs_devices->total_rw_bytes += device->total_bytes;
			
 
				 		spin_lock(&root->fs_info->free_chunk_lock);
			
 
				 		root->fs_info->free_chunk_space += device->total_bytes -
			
@@ -4930,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
 
				 	int i;
			
 
				 
			
 
				 	mutex_lock(&fs_devices->device_list_mutex);
			
 
				-	dev = btrfs_find_device(root, stats->devid, NULL, NULL);
			
 
				+	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
			
 
				 	mutex_unlock(&fs_devices->device_list_mutex);
			
 
				 
			
 
				 	if (!dev) {
			
@@ -4958,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
 
				 		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
			
 
				 	return 0;
			
 
				 }
			
 
				+
			
 
				+int btrfs_scratch_superblock(struct btrfs_device *device)
			
 
				+{
			
 
				+	struct buffer_head *bh;
			
 
				+	struct btrfs_super_block *disk_super;
			
 
				+
			
 
				+	bh = btrfs_read_dev_super(device->bdev);
			
 
				+	if (!bh)
			
 
				+		return -EINVAL;
			
 
				+	disk_super = (struct btrfs_super_block *)bh->b_data;
			
 
				+
			
 
				+	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
			
 
				+	set_buffer_dirty(bh);
			
 
				+	sync_dirty_buffer(bh);
			
 
				+	brelse(bh);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
 
				 	int in_fs_metadata;
			
 
				 	int missing;
			
 
				 	int can_discard;
			
 
				+	int is_tgtdev_for_dev_replace;
			
 
				 
			
 
				 	spinlock_t io_lock;
			
 
				 
			
@@ -88,7 +89,7 @@ struct btrfs_device {
 
				 	u8 uuid[BTRFS_UUID_SIZE];
			
 
				 
			
 
				 	/* per-device scrub information */
			
 
				-	struct scrub_dev *scrub_device;
			
 
				+	struct scrub_ctx *scrub_device;
			
 
				 
			
 
				 	struct btrfs_work work;
			
 
				 	struct rcu_head rcu;
			
@@ -179,6 +180,15 @@ struct btrfs_device_info {
 
				 	u64 total_avail;
			
 
				 };
			
 
				 
			
 
				+struct btrfs_raid_attr {
			
 
				+	int sub_stripes;	/* sub_stripes info for map */
			
 
				+	int dev_stripes;	/* stripes per dev */
			
 
				+	int devs_max;		/* max devs to use */
			
 
				+	int devs_min;		/* min devs needed */
			
 
				+	int devs_increment;	/* ndevs has to be a multiple of this */
			
 
				+	int ncopies;		/* how many copies the data has */
			
 
				+};
			
 
				+
			
 
				 struct map_lookup {
			
 
				 	u64 type;
			
 
				 	int io_align;
			
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 
				 			   struct btrfs_device *device,
			
 
				 			   u64 chunk_tree, u64 chunk_objectid,
			
 
				 			   u64 chunk_offset, u64 start, u64 num_bytes);
			
 
				-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
			
 
				+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
			
 
				 		    u64 logical, u64 *length,
			
 
				 		    struct btrfs_bio **bbio_ret, int mirror_num);
			
 
				 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
			
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
				 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			
 
				 			  struct btrfs_fs_devices **fs_devices_ret);
			
 
				 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
			
 
				-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
			
 
				+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
			
 
				+			       struct btrfs_fs_devices *fs_devices, int step);
			
 
				+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
			
 
				+					 char *device_path,
			
 
				+					 struct btrfs_device **device);
			
 
				+int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
			
 
				+			      struct btrfs_device **device);
			
 
				 int btrfs_add_device(struct btrfs_trans_handle *trans,
			
 
				 		     struct btrfs_root *root,
			
 
				 		     struct btrfs_device *device);
			
 
				 int btrfs_rm_device(struct btrfs_root *root, char *device_path);
			
 
				 void btrfs_cleanup_fs_uuids(void);
			
 
				-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
			
 
				+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
			
 
				 int btrfs_grow_device(struct btrfs_trans_handle *trans,
			
 
				 		      struct btrfs_device *device, u64 new_size);
			
 
				-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
			
 
				+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
			
 
				 				       u8 *uuid, u8 *fsid);
			
 
				 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
			
 
				 int btrfs_init_new_device(struct btrfs_root *root, char *path);
			
 
				+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
			
 
				+				  struct btrfs_device **device_out);
			
 
				 int btrfs_balance(struct btrfs_balance_control *bctl,
			
 
				 		  struct btrfs_ioctl_balance_args *bargs);
			
 
				 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
			
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
 
				 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
			
 
				 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			
 
				 			struct btrfs_fs_info *fs_info);
			
 
				+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
			
 
				+				 struct btrfs_device *srcdev);
			
 
				+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
			
 
				+				      struct btrfs_device *tgtdev);
			
 
				+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
			
 
				+					      struct btrfs_device *tgtdev);
			
 
				+int btrfs_scratch_superblock(struct btrfs_device *device);
			
 
				 
			
 
				 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
			
 
				 				      int index)
			
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 
				 		 */
			
 
				 		if (!value)
			
 
				 			goto out;
			
 
				+	} else {
			
 
				+		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
			
 
				+					name, name_len, 0);
			
 
				+		if (IS_ERR(di)) {
			
 
				+			ret = PTR_ERR(di);
			
 
				+			goto out;
			
 
				+		}
			
 
				+		if (!di && !value)
			
 
				+			goto out;
			
 
				+		btrfs_release_path(path);
			
 
				 	}
			
 
				 
			
 
				 again:
			
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 
				 
			
 
				 	inode_inc_iversion(inode);
			
 
				 	inode->i_ctime = CURRENT_TIME;
			
 
				+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
			
 
				 	ret = btrfs_update_inode(trans, root, inode);
			
 
				 	BUG_ON(ret);
			
 
				 out:
			
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 
				 
			
 
				 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
			
 
				 		if (verify_dir_item(root, leaf, di))
			
 
				-			continue;
			
 
				+			goto next;
			
 
				 
			
 
				 		name_len = btrfs_dir_name_len(leaf, di);
			
 
				 		total_size += name_len + 1;
			
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -45,7 +45,8 @@ struct extent_buffer;
 
				 
			
 
				 #define show_root_type(obj)						\
			
 
				 	obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||		\
			
 
				-	      (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-"
			
 
				+	      (obj >= BTRFS_ROOT_TREE_OBJECTID &&			\
			
 
				+	       obj <= BTRFS_CSUM_TREE_OBJECTID)) ? __show_root_type(obj) : "-"
			
 
				 
			
 
				 #define BTRFS_GROUP_FLAGS	\
			
 
				 	{ BTRFS_BLOCK_GROUP_DATA,	"DATA"}, \