@@ -58,9 +58,23 @@
 */
#define RBIO_CACHE_READY_BIT	3

+/*
+ * bbio and raid_map are managed by the caller, so we shouldn't free
+ * them here.  Besides that, rbios with this flag set must not be
+ * cached, because we need raid_map to check whether two rbios cover
+ * the same stripe, and it is very likely that the caller has already
+ * freed raid_map, so don't cache those rbios.
+ */
+#define RBIO_HOLD_BBIO_MAP_BIT	4

#define RBIO_CACHE_SIZE 1024

+enum btrfs_rbio_ops {
+	BTRFS_RBIO_WRITE	= 0,
+	BTRFS_RBIO_READ_REBUILD	= 1,
+	BTRFS_RBIO_PARITY_SCRUB	= 2,
+};
+
struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;
@@ -117,13 +131,16 @@ struct btrfs_raid_bio {
	/* number of data stripes (no p/q) */
	int nr_data;

+	int real_stripes;
+
+	int stripe_npages;
	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
-	int read_rebuild;
+	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;
@@ -131,6 +148,7 @@ struct btrfs_raid_bio {
	/* second bad stripe (for raid6 use) */
	int failb;

+	int scrubp;
	/*
	 * number of pages needed to represent the full
	 * stripe
@@ -144,8 +162,13 @@ struct btrfs_raid_bio {
	 */
	int bio_list_bytes;

+	int generic_bio_cnt;
+
	atomic_t refs;

+	atomic_t stripes_pending;
+
+	atomic_t error;
	/*
	 * these are two arrays of pointers. We allocate the
	 * rbio big enough to hold them both and setup their
@@ -162,6 +185,11 @@ struct btrfs_raid_bio {
	 * here for faster lookup
	 */
	struct page **bio_pages;
+
+	/*
+	 * bitmap to record which horizontal stripes have data
+	 */
+	unsigned long *dbitmap;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+					 int need_check);
+static void async_scrub_parity(struct btrfs_raid_bio *rbio);
+
/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
+	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
	    cur->raid_map[0])
		return 0;
-	/* reads can't merge with writes */
-	if (last->read_rebuild !=
-	    cur->read_rebuild) {
+	/* we can't merge with different operations */
+	if (last->operation != cur->operation)
+		return 0;
+	/*
+	 * A parity scrub rbio has already read the full stripe from the
+	 * drive; it checks and repairs the parity and writes out the new
+	 * results.
+	 *
+	 * We're not allowed to add any new bios to the
+	 * bio list here, anyone else that wants to
+	 * change this stripe needs to do their own rmw.
+	 */
+	if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
+	    cur->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;
-	}

	return 1;
}
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
-	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;

	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

-			if (next->read_rebuild)
+			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				async_read_rebuild(next);
-			else {
+			else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				async_rmw_stripe(next);
+			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
+				steal_rbio(rbio, next);
+				async_scrub_parity(next);
			}

			goto done_nolock;
@@ -796,6 +841,21 @@ done_nolock:
		remove_rbio_from_cache(rbio);
}

+static inline void
+__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
+{
+	if (need) {
+		kfree(raid_map);
+		kfree(bbio);
+	}
+}
+
+static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
+{
+	__free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
+			!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
+}
+
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
			rbio->stripe_pages[i] = NULL;
		}
	}
-	kfree(rbio->raid_map);
-	kfree(rbio->bbio);
+
+	free_bbio_and_raid_map(rbio);
+
	kfree(rbio);
}

@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *next;
+
+	if (rbio->generic_bio_cnt)
+		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+
	free_raid_bio(rbio);

	while (cur) {
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err)

	bio_put(bio);

-	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;

	/* OK, we have read all the stripes we need to. */
-	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
-	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
+	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

-	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
+		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
			GFP_NOFS);
-	if (!rbio) {
-		kfree(raid_map);
-		kfree(bbio);
+	if (!rbio)
		return ERR_PTR(-ENOMEM);
-	}

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
	rbio->fs_info = root->fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
+	rbio->real_stripes = real_stripes;
+	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	atomic_set(&rbio->refs, 1);
+	atomic_set(&rbio->error, 0);
+	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages and bio_pages array point to the extra
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
	p = rbio + 1;
	rbio->stripe_pages = p;
	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
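+	/*
+	 * the dbitmap storage follows the two page pointer arrays
+	 * in the same kzalloc'd buffer
+	 */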
+	rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;

-	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
-		nr_data = bbio->num_stripes - 2;
+	if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
+		nr_data = real_stripes - 2;
	else
-		nr_data = bbio->num_stripes - 1;
+		nr_data = real_stripes - 1;

	rbio->nr_data = nr_data;
	return rbio;
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
-		BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
-	void *pointers[bbio->num_stripes];
+	void *pointers[rbio->real_stripes];
	int stripe_len = rbio->stripe_len;
	int nr_data = rbio->nr_data;
	int stripe;
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)

	bio_list_init(&bio_list);

-	if (bbio->num_stripes - rbio->nr_data == 1) {
-		p_stripe = bbio->num_stripes - 1;
-	} else if (bbio->num_stripes - rbio->nr_data == 2) {
-		p_stripe = bbio->num_stripes - 2;
-		q_stripe = bbio->num_stripes - 1;
+	if (rbio->real_stripes - rbio->nr_data == 1) {
+		p_stripe = rbio->real_stripes - 1;
+	} else if (rbio->real_stripes - rbio->nr_data == 2) {
+		p_stripe = rbio->real_stripes - 2;
+		q_stripe = rbio->real_stripes - 1;
	} else {
		BUG();
	}
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

-	atomic_set(&rbio->bbio->error, 0);
+	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
			SetPageUptodate(p);
			pointers[stripe++] = kmap(p);

-			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
+			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
		}


-		for (stripe = 0; stripe < bbio->num_stripes; stripe++)
+		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
	 * higher layers (the bio_list in our rbio) and our p/q. Ignore
	 * everything else.
	 */
-	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
		}
	}

-	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
-	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
+	if (likely(!bbio->num_tgtdevs))
+		goto write_data;
+
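+	/*
+	 * Besides the normal stripe devices, queue a copy of the same pages
+	 * for any dev-replace target device recorded in bbio->tgtdev_map.
+	 */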
+	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+		if (!bbio->tgtdev_map[stripe])
+			continue;
+
+		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+			struct page *page;
+			if (stripe < rbio->nr_data) {
+				page = page_in_rbio(rbio, stripe, pagenr, 1);
+				if (!page)
+					continue;
+			} else {
+				page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+
+			ret = rbio_add_io_page(rbio, &bio_list, page,
+					       rbio->bbio->tgtdev_map[stripe],
+					       pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+write_data:
+	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
+	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while (1) {
		bio = bio_list_pop(&bio_list);
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
		stripe = &rbio->bbio->stripes[i];
		stripe_start = stripe->physical;
		if (physical >= stripe_start &&
-		    physical < stripe_start + rbio->stripe_len) {
+		    physical < stripe_start + rbio->stripe_len &&
+		    bio->bi_bdev == stripe->dev->bdev) {
			return i;
		}
	}
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
-		atomic_inc(&rbio->bbio->error);
+		atomic_inc(&rbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
-		atomic_inc(&rbio->bbio->error);
+		atomic_inc(&rbio->error);
	} else {
		ret = -EIO;
	}
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err)

	bio_put(bio);

-	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;
-	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	/*
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
-	struct btrfs_bio *bbio = rbio->bbio;
	struct bio_list bio_list;
	int ret;
	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)

	index_rbio_pages(rbio);

-	atomic_set(&rbio->bbio->error, 0);
+	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
	 * the bbio may be freed once we submit the last bio. Make sure
	 * not to touch it after that
	 */
-	atomic_set(&bbio->stripes_pending, bios_to_read);
+	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;
+	int ret;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-	if (IS_ERR(rbio))
+	if (IS_ERR(rbio)) {
+		__free_bbio_and_raid_map(bbio, raid_map, 1);
		return PTR_ERR(rbio);
+	}
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;
+	rbio->operation = BTRFS_RBIO_WRITE;
+
+	btrfs_bio_counter_inc_noblocked(root->fs_info);
+	rbio->generic_bio_cnt = 1;

	/*
	 * don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
-	if (rbio_is_full(rbio))
-		return full_stripe_write(rbio);
+	if (rbio_is_full(rbio)) {
+		ret = full_stripe_write(rbio);
+		if (ret)
+			btrfs_bio_counter_dec(root->fs_info);
+		return ret;
+	}

	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
			       sizeof(*plug));
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
			INIT_LIST_HEAD(&plug->rbio_list);
		}
		list_add_tail(&rbio->plug_list, &plug->rbio_list);
+		ret = 0;
	} else {
-		return __raid56_parity_write(rbio);
+		ret = __raid56_parity_write(rbio);
+		if (ret)
+			btrfs_bio_counter_dec(root->fs_info);
	}
-	return 0;
+	return ret;
}

/*
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
	int err;
	int i;

-	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
+	pointers = kzalloc(rbio->real_stripes * sizeof(void *),
			   GFP_NOFS);
	if (!pointers) {
		err = -ENOMEM;
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
	faila = rbio->faila;
	failb = rbio->failb;

-	if (rbio->read_rebuild) {
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+		/*
+		 * When doing parity scrub, we use the bitmap to mark the
+		 * horizontal stripes on which we have data; skip the others.
+		 */
+		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+		    !test_bit(pagenr, rbio->dbitmap))
+			continue;
+
		/* setup our array of pointers with pages
		 * from each stripe
		 */
-		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
-			if (rbio->read_rebuild &&
+			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
		}

		/* all raid6 handling here */
-		if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
+		if (rbio->raid_map[rbio->real_stripes - 1] ==
		    RAID6_Q_STRIPE) {

			/*
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
			}

			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
-				raid6_datap_recov(rbio->bbio->num_stripes,
+				raid6_datap_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
-				raid6_2data_recov(rbio->bbio->num_stripes,
+				raid6_2data_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
@@ -1850,7 +1968,7 @@ pstripe:
		 * know they can be trusted. If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
-		if (!rbio->read_rebuild) {
+		if (rbio->operation == BTRFS_RBIO_WRITE) {
			for (i = 0; i < nr_pages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
@@ -1862,12 +1980,12 @@ pstripe:
				}
			}
		}
-		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
-			if (rbio->read_rebuild &&
+			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
@@ -1882,9 +2000,9 @@ cleanup:
	kfree(pointers);

cleanup_io:
-
-	if (rbio->read_rebuild) {
-		if (err == 0)
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+		if (err == 0 &&
+		    !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1893,7 +2011,13 @@ cleanup_io:
	} else if (err == 0) {
		rbio->faila = -1;
		rbio->failb = -1;
-		finish_rmw(rbio);
+
+		if (rbio->operation == BTRFS_RBIO_WRITE)
+			finish_rmw(rbio);
+		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
+			finish_parity_scrub(rbio, 0);
+		else
+			BUG();
	} else {
		rbio_orig_end_io(rbio, err, 0);
	}
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err)
	set_bio_pages_uptodate(bio);
	bio_put(bio);

-	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

-	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		rbio_orig_end_io(rbio, -EIO, 0);
	else
		__raid_recover_end_io(rbio);
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err)
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
-	struct btrfs_bio *bbio = rbio->bbio;
	struct bio_list bio_list;
	int ret;
	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
	if (ret)
		goto cleanup;

-	atomic_set(&rbio->bbio->error, 0);
+	atomic_set(&rbio->error, 0);

	/*
	 * read everything that hasn't failed. Thanks to the
	 * stripe cache, it is possible that some or all of these
	 * pages are going to be uptodate.
	 */
-	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (rbio->faila == stripe || rbio->failb == stripe) {
-			atomic_inc(&rbio->bbio->error);
+			atomic_inc(&rbio->error);
			continue;
		}

@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
	 * were up to date, or we might have no bios to read because
	 * the devices were gone.
	 */
-	if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
		__raid_recover_end_io(rbio);
		goto out;
	} else {
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
	 * the bbio may be freed once we submit the last bio. Make sure
	 * not to touch it after that
	 */
-	atomic_set(&bbio->stripes_pending, bios_to_read);
+	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
@@ -2021,7 +2144,7 @@ out:
	return 0;

cleanup:
-	if (rbio->read_rebuild)
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
		rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;
}
@@ -2034,34 +2157,42 @@ cleanup:
 */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
			  struct btrfs_bio *bbio, u64 *raid_map,
-			  u64 stripe_len, int mirror_num)
+			  u64 stripe_len, int mirror_num, int generic_io)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-	if (IS_ERR(rbio))
+	if (IS_ERR(rbio)) {
+		__free_bbio_and_raid_map(bbio, raid_map, generic_io);
		return PTR_ERR(rbio);
+	}

-	rbio->read_rebuild = 1;
+	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
-		kfree(raid_map);
-		kfree(bbio);
+		__free_bbio_and_raid_map(bbio, raid_map, generic_io);
		kfree(rbio);
		return -EIO;
	}

+	if (generic_io) {
+		btrfs_bio_counter_inc_noblocked(root->fs_info);
+		rbio->generic_bio_cnt = 1;
+	} else {
+		set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
+	}
+
	/*
	 * reconstruct from the q stripe if they are
	 * asking for mirror 3
	 */
	if (mirror_num == 3)
-		rbio->failb = bbio->num_stripes - 2;
+		rbio->failb = rbio->real_stripes - 2;

	ret = lock_stripe_add(rbio);

@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work)
	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}
+
+/*
+ * The following code is used to scrub/replace the parity stripe.
+ *
+ * Note: We must make sure that the pages added to the scrub/replace raid
+ * bio are correct and will not be changed during the scrub/replace; that
+ * is, those pages only hold metadata or file data that is protected by a
+ * checksum.
+ */
+
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+			       struct btrfs_bio *bbio, u64 *raid_map,
+			       u64 stripe_len, struct btrfs_device *scrub_dev,
+			       unsigned long *dbitmap, int stripe_nsectors)
+{
+	struct btrfs_raid_bio *rbio;
+	int i;
+
+	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+	if (IS_ERR(rbio))
+		return NULL;
+	bio_list_add(&rbio->bio_list, bio);
+	/*
+	 * This is a special bio which is used to hold the completion handler
+	 * and make the scrub rbio similar to the other types of rbio.
+	 */
+	ASSERT(!bio->bi_iter.bi_size);
+	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
+
+	for (i = 0; i < rbio->real_stripes; i++) {
+		if (bbio->stripes[i].dev == scrub_dev) {
+			rbio->scrubp = i;
+			break;
+		}
+	}
+
+	/* for now we only support the case where sectorsize equals PAGE_SIZE */
+	ASSERT(root->sectorsize == PAGE_SIZE);
+	ASSERT(rbio->stripe_npages == stripe_nsectors);
+	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+
+	return rbio;
+}
+
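+/*
+ * Record a data page provided by the scrubber at the bio_pages slot that
+ * corresponds to its logical address within the full stripe.
+ */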
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+				   struct page *page, u64 logical)
+{
+	int stripe_offset;
+	int index;
+
+	ASSERT(logical >= rbio->raid_map[0]);
+	ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
+	       rbio->stripe_len * rbio->nr_data);
+	stripe_offset = (int)(logical - rbio->raid_map[0]);
+	index = stripe_offset >> PAGE_CACHE_SHIFT;
+	rbio->bio_pages[index] = page;
+}
+
+/*
+ * We only scrub the parity for the horizontal stripes where we have correct
+ * data, so we don't need to allocate pages for every stripe.
+ */
+static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	int bit;
+	int index;
+	struct page *page;
+
+	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
+		for (i = 0; i < rbio->real_stripes; i++) {
+			index = i * rbio->stripe_npages + bit;
+			if (rbio->stripe_pages[index])
+				continue;
+
+			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			if (!page)
+				return -ENOMEM;
+			rbio->stripe_pages[index] = page;
+			ClearPageUptodate(page);
+		}
+	}
+	return 0;
+}
+
+/*
+ * end io function used by finish_parity_scrub. When we finally
+ * get here, we've written out the repaired parity.
+ */
+static void raid_write_parity_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
+		return;
+
+	err = 0;
+
+	if (atomic_read(&rbio->error))
+		err = -EIO;
+
+	rbio_orig_end_io(rbio, err, 0);
+}
+
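+/*
+ * Recompute the parity for every horizontal stripe marked in dbitmap and
+ * compare it with the parity we read from the device being scrubbed.  Only
+ * the pages whose parity was wrong are written back; for dev-replace, all
+ * of the checked parity pages are also copied to the replace target.  When
+ * need_check is zero the caller already knows the parity must be written,
+ * so the verification pass is skipped and we go straight to writeback.
+ */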
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+					 int need_check)
+{
+	struct btrfs_bio *bbio = rbio->bbio;
+	void *pointers[rbio->real_stripes];
+	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
+	int nr_data = rbio->nr_data;
+	int stripe;
+	int pagenr;
+	int p_stripe = -1;
+	int q_stripe = -1;
+	struct page *p_page = NULL;
+	struct page *q_page = NULL;
+	struct bio_list bio_list;
+	struct bio *bio;
+	int is_replace = 0;
+	int ret;
+
+	bio_list_init(&bio_list);
+
+	if (rbio->real_stripes - rbio->nr_data == 1) {
+		p_stripe = rbio->real_stripes - 1;
+	} else if (rbio->real_stripes - rbio->nr_data == 2) {
+		p_stripe = rbio->real_stripes - 2;
+		q_stripe = rbio->real_stripes - 1;
+	} else {
+		BUG();
+	}
+
+	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+		is_replace = 1;
+		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+	}
+
+	/*
+	 * The higher layers (the scrubber) are unlikely to use this area of
+	 * the disk again soon, so don't cache it.
+	 */
+	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+	if (!need_check)
+		goto writeback;
+
+	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (!p_page)
+		goto cleanup;
+	SetPageUptodate(p_page);
+
+	if (q_stripe != -1) {
+		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!q_page) {
+			__free_page(p_page);
+			goto cleanup;
+		}
+		SetPageUptodate(q_page);
+	}
+
+	atomic_set(&rbio->error, 0);
+
+	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+		struct page *p;
+		void *parity;
+		/* first collect one page from each data stripe */
+		for (stripe = 0; stripe < nr_data; stripe++) {
+			p = page_in_rbio(rbio, stripe, pagenr, 0);
+			pointers[stripe] = kmap(p);
+		}
+
+		/* then add the parity stripe */
+		pointers[stripe++] = kmap(p_page);
+
+		if (q_stripe != -1) {
+
+			/*
+			 * raid6, add the qstripe and call the
+			 * library function to fill in our p/q
+			 */
+			pointers[stripe++] = kmap(q_page);
+
+			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+						pointers);
+		} else {
+			/* raid5 */
+			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+		}
+
+		/* check the scrubbing parity and repair it */
+		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+		parity = kmap(p);
+		if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
+			memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
+		else
+			/* parity is right, no need to write it back */
+			bitmap_clear(rbio->dbitmap, pagenr, 1);
+		kunmap(p);
+
+		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+	}
+
+	__free_page(p_page);
+	if (q_page)
+		__free_page(q_page);
+
+writeback:
+	/*
+	 * time to start writing. Make bios for everything from the
+	 * higher layers (the bio_list in our rbio) and our p/q. Ignore
+	 * everything else.
+	 */
+	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+		struct page *page;
+
+		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+		ret = rbio_add_io_page(rbio, &bio_list,
+			       page, rbio->scrubp, pagenr, rbio->stripe_len);
+		if (ret)
+			goto cleanup;
+	}
+
+	if (!is_replace)
+		goto submit_write;
+
+	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
+		struct page *page;
+
+		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+		ret = rbio_add_io_page(rbio, &bio_list, page,
+				       bbio->tgtdev_map[rbio->scrubp],
+				       pagenr, rbio->stripe_len);
+		if (ret)
+			goto cleanup;
+	}
+
+submit_write:
+	nr_data = bio_list_size(&bio_list);
+	if (!nr_data) {
+		/* Every parity is right */
+		rbio_orig_end_io(rbio, 0, 0);
+		return;
+	}
+
+	atomic_set(&rbio->stripes_pending, nr_data);
+
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_write_parity_end_io;
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(WRITE, bio);
+	}
+	return;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
+{
+	if (stripe >= 0 && stripe < rbio->nr_data)
+		return 1;
+	return 0;
+}
+
+/*
+ * While we're doing the parity check and repair, we could have errors
+ * in reading pages off the disk. This checks for errors and if we're
+ * not able to read the page it'll trigger parity reconstruction. The
+ * parity scrub will be finished after we've reconstructed the failed
+ * stripes
+ */
+static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+{
+	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+		goto cleanup;
+
+	if (rbio->faila >= 0 || rbio->failb >= 0) {
+		int dfail = 0, failp = -1;
+
+		if (is_data_stripe(rbio, rbio->faila))
+			dfail++;
+		else if (is_parity_stripe(rbio->faila))
+			failp = rbio->faila;
+
+		if (is_data_stripe(rbio, rbio->failb))
+			dfail++;
+		else if (is_parity_stripe(rbio->failb))
+			failp = rbio->failb;
+
+		/*
+		 * Because we cannot use the parity that is being scrubbed to
+		 * repair data, our repair capability is reduced by one (in
+		 * the case of RAID5, we cannot repair anything).
+		 */
+		if (dfail > rbio->bbio->max_errors - 1)
+			goto cleanup;
+
+		/*
+		 * If all the data is good and only the parity is bad, just
+		 * repair the parity.
+		 */
+		if (dfail == 0) {
+			finish_parity_scrub(rbio, 0);
+			return;
+		}
+
+		/*
+		 * Getting here means we have one corrupted data stripe and
+		 * one corrupted parity on RAID6. If the corrupted parity is
+		 * the one being scrubbed, we can luckily use the other parity
+		 * to repair the data; otherwise we cannot repair the data
+		 * stripe.
+		 */
+		if (failp != rbio->scrubp)
+			goto cleanup;
+
+		__raid_recover_end_io(rbio);
+	} else {
+		finish_parity_scrub(rbio, 1);
+	}
+	return;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * end io for the read phase of the parity scrub cycle. All the bios here
+ * are physical stripe bios we've read from the disk so we can recalculate
+ * the parity of the stripe.
+ *
+ * This will usually kick off finish_parity_scrub once all the bios are read
+ * in, but it may trigger parity reconstruction if we had any errors along
+ * the way.
+ */
+static void raid56_parity_scrub_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+	else
+		set_bio_pages_uptodate(bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->stripes_pending))
+		return;
+
+	/*
+	 * this will normally call finish_parity_scrub to start our write,
+	 * but if there are any failed stripes we'll reconstruct
+	 * from parity first
+	 */
+	validate_rbio_for_parity_scrub(rbio);
+}
+
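+/*
+ * Read in every page we need for the horizontal stripes marked in dbitmap
+ * (anything not already in the bio list or the stripe cache), then run the
+ * parity check once the reads complete.
+ */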
+static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct bio_list bio_list;
+	int ret;
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	ret = alloc_rbio_essential_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	bio_list_init(&bio_list);
+
+	atomic_set(&rbio->error, 0);
+	/*
+	 * build a list of bios to read all the missing parts of this
+	 * stripe
+	 */
+	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+			struct page *page;
+			/*
+			 * we want to find all the pages missing from
+			 * the rbio and read them from the disk. If
+			 * page_in_rbio finds a page in the bio list
+			 * we don't need to read it off the stripe.
+			 */
+			page = page_in_rbio(rbio, stripe, pagenr, 1);
+			if (page)
+				continue;
+
+			page = rbio_stripe_page(rbio, stripe, pagenr);
+			/*
+			 * the bio cache may have handed us an uptodate
+			 * page. If so, be happy and use it
+			 */
+			if (PageUptodate(page))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list, page,
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * this can happen if others have merged with
+		 * us, it means there is nothing left to read.
+		 * But if there are missing devices it may not be
+		 * safe to do the full stripe write yet.
+		 */
+		goto finish;
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio. Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&rbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid56_parity_scrub_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+	/* the actual write will happen once the reads are done */
+	return;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+	return;
+
+finish:
+	validate_rbio_for_parity_scrub(rbio);
+}
+
+static void scrub_parity_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	raid56_parity_scrub_stripe(rbio);
+}
+
+static void async_scrub_parity(struct btrfs_raid_bio *rbio)
+{
+	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+			scrub_parity_work, NULL, NULL);
+
+	btrfs_queue_work(rbio->fs_info->rmw_workers,
+			 &rbio->work);
+}
+
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+	if (!lock_stripe_add(rbio))
+		async_scrub_parity(rbio);
+}