9 лет назад · 968f3e374f
--- a/Documentation/filesystems/btrfs.txt
+++ b/Documentation/filesystems/btrfs.txt
@@ -1,20 +1,10 @@
 
				-
			
 
				 BTRFS
			
 
				 =====
			
 
				 
			
 
				-Btrfs is a copy on write filesystem for Linux aimed at
			
 
				-implementing advanced features while focusing on fault tolerance,
			
 
				-repair and easy administration. Initially developed by Oracle, Btrfs
			
 
				-is licensed under the GPL and open for contribution from anyone.
			
 
				-
			
 
				-Linux has a wealth of filesystems to choose from, but we are facing a
			
 
				-number of challenges with scaling to the large storage subsystems that
			
 
				-are becoming common in today's data centers. Filesystems need to scale
			
 
				-in their ability to address and manage large storage, and also in
			
 
				-their ability to detect, repair and tolerate errors in the data stored
			
 
				-on disk.  Btrfs is under heavy development, and is not suitable for
			
 
				-any uses other than benchmarking and review. The Btrfs disk format is
			
 
				-not yet finalized.
			
 
				+Btrfs is a copy on write filesystem for Linux aimed at implementing advanced
			
 
				+features while focusing on fault tolerance, repair and easy administration.
			
 
				+Jointly developed by several companies, licensed under the GPL and open for
			
 
				+contribution from anyone.
			
 
				 
			
 
				 The main Btrfs features include:
			
 
				 
			
@@ -28,243 +18,14 @@ The main Btrfs features include:
 
				     * Checksums on data and metadata (multiple algorithms available)
			
 
				     * Compression
			
 
				     * Integrated multiple device support, with several raid algorithms
			
 
				-    * Online filesystem check (not yet implemented)
			
 
				-    * Very fast offline filesystem check
			
 
				-    * Efficient incremental backup and FS mirroring (not yet implemented)
			
 
				+    * Offline filesystem check
			
 
				+    * Efficient incremental backup and FS mirroring
			
 
				     * Online filesystem defragmentation
			
 
				 
			
 
				+For more information please refer to the wiki
			
 
				 
			
 
				-Mount Options
			
 
				-=============
			
 
				-
			
 
				-When mounting a btrfs filesystem, the following option are accepted.
			
 
				-Options with (*) are default options and will not show in the mount options.
			
 
				-
			
 
				-  alloc_start=<bytes>
			
 
				-	Debugging option to force all block allocations above a certain
			
 
				-	byte threshold on each block device.  The value is specified in
			
 
				-	bytes, optionally with a K, M, or G suffix, case insensitive.
			
 
				-	Default is 1MB.
			
 
				-
			
 
				-  noautodefrag(*)
			
 
				-  autodefrag
			
 
				-	Disable/enable auto defragmentation.
			
 
				-	Auto defragmentation detects small random writes into files and queue
			
 
				-	them up for the defrag process.  Works best for small files;
			
 
				-	Not well suited for large database workloads.
			
 
				-
			
 
				-  check_int
			
 
				-  check_int_data
			
 
				-  check_int_print_mask=<value>
			
 
				-	These debugging options control the behavior of the integrity checking
			
 
				-	module (the BTRFS_FS_CHECK_INTEGRITY config option required).
			
 
				-
			
 
				-	check_int enables the integrity checker module, which examines all
			
 
				-	block write requests to ensure on-disk consistency, at a large
			
 
				-	memory and CPU cost.
			
 
				-
			
 
				-	check_int_data includes extent data in the integrity checks, and
			
 
				-	implies the check_int option.
			
 
				-
			
 
				-	check_int_print_mask takes a bitmask of BTRFSIC_PRINT_MASK_* values
			
 
				-	as defined in fs/btrfs/check-integrity.c, to control the integrity
			
 
				-	checker module behavior.
			
 
				-
			
 
				-	See comments at the top of fs/btrfs/check-integrity.c for more info.
			
 
				-
			
 
				-  commit=<seconds>
			
 
				-	Set the interval of periodic commit, 30 seconds by default. Higher
			
 
				-	values defer data being synced to permanent storage with obvious
			
 
				-	consequences when the system crashes. The upper bound is not forced,
			
 
				-	but a warning is printed if it's more than 300 seconds (5 minutes).
			
 
				-
			
 
				-  compress
			
 
				-  compress=<type>
			
 
				-  compress-force
			
 
				-  compress-force=<type>
			
 
				-	Control BTRFS file data compression.  Type may be specified as "zlib"
			
 
				-	"lzo" or "no" (for no compression, used for remounting).  If no type
			
 
				-	is specified, zlib is used.  If compress-force is specified,
			
 
				-	all files will be compressed, whether or not they compress well.
			
 
				-	If compression is enabled, nodatacow and nodatasum are disabled.
			
 
				-
			
 
				-  degraded
			
 
				-	Allow mounts to continue with missing devices.  A read-write mount may
			
 
				-	fail with too many devices missing, for example if a stripe member
			
 
				-	is completely missing.
			
 
				-
			
 
				-  device=<devicepath>
			
 
				-	Specify a device during mount so that ioctls on the control device
			
 
				-	can be avoided.  Especially useful when trying to mount a multi-device
			
 
				-	setup as root.  May be specified multiple times for multiple devices.
			
 
				-
			
 
				-  nodiscard(*)
			
 
				-  discard
			
 
				-	Disable/enable discard mount option.
			
 
				-	Discard issues frequent commands to let the block device reclaim space
			
 
				-	freed by the filesystem.
			
 
				-	This is useful for SSD devices, thinly provisioned
			
 
				-	LUNs and virtual machine images, but may have a significant
			
 
				-	performance impact.  (The fstrim command is also available to
			
 
				-	initiate batch trims from userspace).
			
 
				-
			
 
				-  noenospc_debug(*)
			
 
				-  enospc_debug
			
 
				-	Disable/enable debugging option to be more verbose in some ENOSPC conditions.
			
 
				-
			
 
				-  fatal_errors=<action>
			
 
				-	Action to take when encountering a fatal error:
			
 
				-	  "bug" - BUG() on a fatal error.  This is the default.
			
 
				-	  "panic" - panic() on a fatal error.
			
 
				-
			
 
				-  noflushoncommit(*)
			
 
				-  flushoncommit
			
 
				-	The 'flushoncommit' mount option forces any data dirtied by a write in a
			
 
				-	prior transaction to commit as part of the current commit.  This makes
			
 
				-	the committed state a fully consistent view of the file system from the
			
 
				-	application's perspective (i.e., it includes all completed file system
			
 
				-	operations).  This was previously the behavior only when a snapshot is
			
 
				-	created.
			
 
				-
			
 
				-  inode_cache
			
 
				-	Enable free inode number caching.   Defaults to off due to an overflow
			
 
				-	problem when the free space crcs don't fit inside a single page.
			
 
				-
			
 
				-  max_inline=<bytes>
			
 
				-	Specify the maximum amount of space, in bytes, that can be inlined in
			
 
				-	a metadata B-tree leaf.  The value is specified in bytes, optionally
			
 
				-	with a K, M, or G suffix, case insensitive.  In practice, this value
			
 
				-	is limited by the root sector size, with some space unavailable due
			
 
				-	to leaf headers.  For a 4k sector size, max inline data is ~3900 bytes.
			
 
				-
			
 
				-  metadata_ratio=<value>
			
 
				-	Specify that 1 metadata chunk should be allocated after every <value>
			
 
				-	data chunks.  Off by default.
			
 
				-
			
 
				-  acl(*)
			
 
				-  noacl
			
 
				-	Enable/disable support for Posix Access Control Lists (ACLs).  See the
			
 
				-	acl(5) manual page for more information about ACLs.
			
 
				-
			
 
				-  barrier(*)
			
 
				-  nobarrier
			
 
				-        Enable/disable the use of block layer write barriers.  Write barriers
			
 
				-	ensure that certain IOs make it through the device cache and are on
			
 
				-	persistent storage. If disabled on a device with a volatile
			
 
				-	(non-battery-backed) write-back cache, nobarrier option will lead to
			
 
				-	filesystem corruption on a system crash or power loss.
			
 
				-
			
 
				-  datacow(*)
			
 
				-  nodatacow
			
 
				-	Enable/disable data copy-on-write for newly created files.
			
 
				-	Nodatacow implies nodatasum, and disables all compression.
			
 
				-
			
 
				-  datasum(*)
			
 
				-  nodatasum
			
 
				-	Enable/disable data checksumming for newly created files.
			
 
				-	Datasum implies datacow.
			
 
				-
			
 
				-  treelog(*)
			
 
				-  notreelog
			
 
				-	Enable/disable the tree logging used for fsync and O_SYNC writes.
			
 
				-
			
 
				-  recovery
			
 
				-	Enable autorecovery attempts if a bad tree root is found at mount time.
			
 
				-	Currently this scans a list of several previous tree roots and tries to
			
 
				-	use the first readable.
			
 
				-
			
 
				-  rescan_uuid_tree
			
 
				-	Force check and rebuild procedure of the UUID tree. This should not
			
 
				-	normally be needed.
			
 
				-
			
 
				-  skip_balance
			
 
				-	Skip automatic resume of interrupted balance operation after mount.
			
 
				-	May be resumed with "btrfs balance resume."
			
 
				-
			
 
				-  space_cache (*)
			
 
				-	Enable the on-disk freespace cache.
			
 
				-  nospace_cache
			
 
				-	Disable freespace cache loading without clearing the cache.
			
 
				-  clear_cache
			
 
				-	Force clearing and rebuilding of the disk space cache if something
			
 
				-	has gone wrong.
			
 
				-
			
 
				-  ssd
			
 
				-  nossd
			
 
				-  ssd_spread
			
 
				-	Options to control ssd allocation schemes.  By default, BTRFS will
			
 
				-	enable or disable ssd allocation heuristics depending on whether a
			
 
				-	rotational or non-rotational disk is in use.  The ssd and nossd options
			
 
				-	can override this autodetection.
			
 
				-
			
 
				-	The ssd_spread mount option attempts to allocate into big chunks
			
 
				-	of unused space, and may perform better on low-end ssds.  ssd_spread
			
 
				-	implies ssd, enabling all other ssd heuristics as well.
			
 
				-
			
 
				-  subvol=<path>
			
 
				-	Mount subvolume at <path> rather than the root subvolume.  <path> is
			
 
				-	relative to the top level subvolume.
			
 
				-
			
 
				-  subvolid=<ID>
			
 
				-	Mount subvolume specified by an ID number rather than the root subvolume.
			
 
				-	This allows mounting of subvolumes which are not in the root of the mounted
			
 
				-	filesystem.
			
 
				-	You can use "btrfs subvolume list" to see subvolume ID numbers.
			
 
				-
			
 
				-  subvolrootid=<objectid> (deprecated)
			
 
				-	Mount subvolume specified by <objectid> rather than the root subvolume.
			
 
				-	This allows mounting of subvolumes which are not in the root of the mounted
			
 
				-	filesystem.
			
 
				-	You can use "btrfs subvolume show " to see the object ID for a subvolume.
			
 
				-
			
 
				-  thread_pool=<number>
			
 
				-	The number of worker threads to allocate.  The default number is equal
			
 
				-	to the number of CPUs + 2, or 8, whichever is smaller.
			
 
				-
			
 
				-  user_subvol_rm_allowed
			
 
				-	Allow subvolumes to be deleted by a non-root user. Use with caution.
			
 
				-
			
 
				-MAILING LIST
			
 
				-============
			
 
				-
			
 
				-There is a Btrfs mailing list hosted on vger.kernel.org. You can
			
 
				-find details on how to subscribe here:
			
 
				-
			
 
				-http://vger.kernel.org/vger-lists.html#linux-btrfs
			
 
				-
			
 
				-Mailing list archives are available from gmane:
			
 
				-
			
 
				-http://dir.gmane.org/gmane.comp.file-systems.btrfs
			
 
				-
			
 
				-
			
 
				-
			
 
				-IRC
			
 
				-===
			
 
				-
			
 
				-Discussion of Btrfs also occurs on the #btrfs channel of the Freenode
			
 
				-IRC network.
			
 
				-
			
 
				-
			
 
				-
			
 
				-	UTILITIES
			
 
				-	=========
			
 
				-
			
 
				-Userspace tools for creating and manipulating Btrfs file systems are
			
 
				-available from the git repository at the following location:
			
 
				-
			
 
				- http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs.git
			
 
				- git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs.git
			
 
				-
			
 
				-These include the following tools:
			
 
				-
			
 
				-* mkfs.btrfs: create a filesystem
			
 
				-
			
 
				-* btrfs: a single tool to manage the filesystems, refer to the manpage for more details
			
 
				-
			
 
				-* 'btrfsck' or 'btrfs check': do a consistency check of the filesystem
			
 
				-
			
 
				-Other tools for specific tasks:
			
 
				-
			
 
				-* btrfs-convert: in-place conversion from ext2/3/4 filesystems
			
 
				+  https://btrfs.wiki.kernel.org
			
 
				 
			
 
				-* btrfs-image: dump filesystem metadata for debugging
			
 
				+that maintains information about administration tasks, frequently asked
			
 
				+questions, use cases, mount options, comprehensible changelogs, features,
			
 
				+manual pages, source code repositories, contacts etc.
			
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -148,8 +148,7 @@ int __init btrfs_prelim_ref_init(void)
 
				 
			
 
				 void btrfs_prelim_ref_exit(void)
			
 
				 {
			
 
				-	if (btrfs_prelim_ref_cache)
			
 
				-		kmem_cache_destroy(btrfs_prelim_ref_cache);
			
 
				+	kmem_cache_destroy(btrfs_prelim_ref_cache);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -566,17 +565,14 @@ static void __merge_refs(struct list_head *head, int mode)
 
				 		struct __prelim_ref *pos2 = pos1, *tmp;
			
 
				 
			
 
				 		list_for_each_entry_safe_continue(pos2, tmp, head, list) {
			
 
				-			struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2;
			
 
				+			struct __prelim_ref *ref1 = pos1, *ref2 = pos2;
			
 
				 			struct extent_inode_elem *eie;
			
 
				 
			
 
				 			if (!ref_for_same_block(ref1, ref2))
			
 
				 				continue;
			
 
				 			if (mode == 1) {
			
 
				-				if (!ref1->parent && ref2->parent) {
			
 
				-					xchg = ref1;
			
 
				-					ref1 = ref2;
			
 
				-					ref2 = xchg;
			
 
				-				}
			
 
				+				if (!ref1->parent && ref2->parent)
			
 
				+					swap(ref1, ref2);
			
 
				 			} else {
			
 
				 				if (ref1->parent != ref2->parent)
			
 
				 					continue;
			
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -95,6 +95,7 @@
 
				 #include <linux/genhd.h>
			
 
				 #include <linux/blkdev.h>
			
 
				 #include <linux/vmalloc.h>
			
 
				+#include <linux/string.h>
			
 
				 #include "ctree.h"
			
 
				 #include "disk-io.h"
			
 
				 #include "hash.h"
			
@@ -105,6 +106,7 @@
 
				 #include "locking.h"
			
 
				 #include "check-integrity.h"
			
 
				 #include "rcu-string.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
			
 
				 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
			
@@ -176,7 +178,7 @@ struct btrfsic_block {
 
				  * Elements of this type are allocated dynamically and required because
			
 
				  * each block object can refer to and can be ref from multiple blocks.
			
 
				  * The key to lookup them in the hashtable is the dev_bytenr of
			
 
				- * the block ref to plus the one from the block refered from.
			
 
				+ * the block ref to plus the one from the block referred from.
			
 
				  * The fact that they are searchable via a hashtable and that a
			
 
				  * ref_cnt is maintained is not required for the btrfs integrity
			
 
				  * check algorithm itself, it is only used to make the output more
			
@@ -3076,7 +3078,7 @@ int btrfsic_mount(struct btrfs_root *root,
 
				 
			
 
				 	list_for_each_entry(device, dev_head, dev_list) {
			
 
				 		struct btrfsic_dev_state *ds;
			
 
				-		char *p;
			
 
				+		const char *p;
			
 
				 
			
 
				 		if (!device->bdev || !device->name)
			
 
				 			continue;
			
@@ -3092,11 +3094,7 @@ int btrfsic_mount(struct btrfs_root *root,
 
				 		ds->state = state;
			
 
				 		bdevname(ds->bdev, ds->name);
			
 
				 		ds->name[BDEVNAME_SIZE - 1] = '\0';
			
 
				-		for (p = ds->name; *p != '\0'; p++);
			
 
				-		while (p > ds->name && *p != '/')
			
 
				-			p--;
			
 
				-		if (*p == '/')
			
 
				-			p++;
			
 
				+		p = kbasename(ds->name);
			
 
				 		strlcpy(ds->name, p, sizeof(ds->name));
			
 
				 		btrfsic_dev_state_hashtable_add(ds,
			
 
				 						&btrfsic_dev_state_hashtable);
			
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -48,6 +48,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
				 void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
			
 
				 				   unsigned long pg_index,
			
 
				 				   unsigned long pg_offset);
			
 
				+
			
 
				+enum btrfs_compression_type {
			
 
				+	BTRFS_COMPRESS_NONE  = 0,
			
 
				+	BTRFS_COMPRESS_ZLIB  = 1,
			
 
				+	BTRFS_COMPRESS_LZO   = 2,
			
 
				+	BTRFS_COMPRESS_TYPES = 2,
			
 
				+	BTRFS_COMPRESS_LAST  = 3,
			
 
				+};
			
 
				+
			
 
				 struct btrfs_compress_op {
			
 
				 	struct list_head *(*alloc_workspace)(void);
			
 
				 
			
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -311,7 +311,7 @@ struct tree_mod_root {
 
				 
			
 
				 struct tree_mod_elem {
			
 
				 	struct rb_node node;
			
 
				-	u64 index;		/* shifted logical */
			
 
				+	u64 logical;
			
 
				 	u64 seq;
			
 
				 	enum mod_log_op op;
			
 
				 
			
@@ -435,11 +435,11 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 
				 
			
 
				 /*
			
 
				  * key order of the log:
			
 
				- *       index -> sequence
			
 
				+ *       node/leaf start address -> sequence
			
 
				  *
			
 
				- * the index is the shifted logical of the *new* root node for root replace
			
 
				- * operations, or the shifted logical of the affected block for all other
			
 
				- * operations.
			
 
				+ * The 'start address' is the logical address of the *new* root node
			
 
				+ * for root replace operations, or the logical address of the affected
			
 
				+ * block for all other operations.
			
 
				  *
			
 
				  * Note: must be called with write lock (tree_mod_log_write_lock).
			
 
				  */
			
@@ -460,9 +460,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 
				 	while (*new) {
			
 
				 		cur = container_of(*new, struct tree_mod_elem, node);
			
 
				 		parent = *new;
			
 
				-		if (cur->index < tm->index)
			
 
				+		if (cur->logical < tm->logical)
			
 
				 			new = &((*new)->rb_left);
			
 
				-		else if (cur->index > tm->index)
			
 
				+		else if (cur->logical > tm->logical)
			
 
				 			new = &((*new)->rb_right);
			
 
				 		else if (cur->seq < tm->seq)
			
 
				 			new = &((*new)->rb_left);
			
@@ -523,7 +523,7 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
 
				 	if (!tm)
			
 
				 		return NULL;
			
 
				 
			
 
				-	tm->index = eb->start >> PAGE_CACHE_SHIFT;
			
 
				+	tm->logical = eb->start;
			
 
				 	if (op != MOD_LOG_KEY_ADD) {
			
 
				 		btrfs_node_key(eb, &tm->key, slot);
			
 
				 		tm->blockptr = btrfs_node_blockptr(eb, slot);
			
@@ -588,7 +588,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 
				 		goto free_tms;
			
 
				 	}
			
 
				 
			
 
				-	tm->index = eb->start >> PAGE_CACHE_SHIFT;
			
 
				+	tm->logical = eb->start;
			
 
				 	tm->slot = src_slot;
			
 
				 	tm->move.dst_slot = dst_slot;
			
 
				 	tm->move.nr_items = nr_items;
			
@@ -699,7 +699,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 
				 		goto free_tms;
			
 
				 	}
			
 
				 
			
 
				-	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
			
 
				+	tm->logical = new_root->start;
			
 
				 	tm->old_root.logical = old_root->start;
			
 
				 	tm->old_root.level = btrfs_header_level(old_root);
			
 
				 	tm->generation = btrfs_header_generation(old_root);
			
@@ -739,16 +739,15 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 
				 	struct rb_node *node;
			
 
				 	struct tree_mod_elem *cur = NULL;
			
 
				 	struct tree_mod_elem *found = NULL;
			
 
				-	u64 index = start >> PAGE_CACHE_SHIFT;
			
 
				 
			
 
				 	tree_mod_log_read_lock(fs_info);
			
 
				 	tm_root = &fs_info->tree_mod_log;
			
 
				 	node = tm_root->rb_node;
			
 
				 	while (node) {
			
 
				 		cur = container_of(node, struct tree_mod_elem, node);
			
 
				-		if (cur->index < index) {
			
 
				+		if (cur->logical < start) {
			
 
				 			node = node->rb_left;
			
 
				-		} else if (cur->index > index) {
			
 
				+		} else if (cur->logical > start) {
			
 
				 			node = node->rb_right;
			
 
				 		} else if (cur->seq < min_seq) {
			
 
				 			node = node->rb_left;
			
@@ -1230,9 +1229,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
 
				 		return NULL;
			
 
				 
			
 
				 	/*
			
 
				-	 * the very last operation that's logged for a root is the replacement
			
 
				-	 * operation (if it is replaced at all). this has the index of the *new*
			
 
				-	 * root, making it the very first operation that's logged for this root.
			
 
				+	 * the very last operation that's logged for a root is the
			
 
				+	 * replacement operation (if it is replaced at all). this has
			
 
				+	 * the logical address of the *new* root, making it the very
			
 
				+	 * first operation that's logged for this root.
			
 
				 	 */
			
 
				 	while (1) {
			
 
				 		tm = tree_mod_log_search_oldest(fs_info, root_logical,
			
@@ -1336,7 +1336,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 
				 		if (!next)
			
 
				 			break;
			
 
				 		tm = container_of(next, struct tree_mod_elem, node);
			
 
				-		if (tm->index != first_tm->index)
			
 
				+		if (tm->logical != first_tm->logical)
			
 
				 			break;
			
 
				 	}
			
 
				 	tree_mod_log_read_unlock(fs_info);
			
@@ -5361,7 +5361,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
			
 
				+	tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL);
			
 
				 	if (!tmp_buf) {
			
 
				 		ret = -ENOMEM;
			
 
				 		goto out;
			
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -100,6 +100,9 @@ struct btrfs_ordered_sum;
 
				 /* tracks free space in block groups. */
			
 
				 #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
			
 
				 
			
 
				+/* device stats in the device tree */
			
 
				+#define BTRFS_DEV_STATS_OBJECTID 0ULL
			
 
				+
			
 
				 /* for storing balance parameters in the root tree */
			
 
				 #define BTRFS_BALANCE_OBJECTID -4ULL
			
 
				 
			
@@ -715,14 +718,6 @@ struct btrfs_timespec {
 
				 	__le32 nsec;
			
 
				 } __attribute__ ((__packed__));
			
 
				 
			
 
				-enum btrfs_compression_type {
			
 
				-	BTRFS_COMPRESS_NONE  = 0,
			
 
				-	BTRFS_COMPRESS_ZLIB  = 1,
			
 
				-	BTRFS_COMPRESS_LZO   = 2,
			
 
				-	BTRFS_COMPRESS_TYPES = 2,
			
 
				-	BTRFS_COMPRESS_LAST  = 3,
			
 
				-};
			
 
				-
			
 
				 struct btrfs_inode_item {
			
 
				 	/* nfs style generation number */
			
 
				 	__le64 generation;
			
@@ -793,7 +788,7 @@ struct btrfs_root_item {
 
				 
			
 
				 	/*
			
 
				 	 * This generation number is used to test if the new fields are valid
			
 
				-	 * and up to date while reading the root item. Everytime the root item
			
 
				+	 * and up to date while reading the root item. Every time the root item
			
 
				 	 * is written out, the "generation" field is copied into this field. If
			
 
				 	 * anyone ever mounted the fs with an older kernel, we will have
			
 
				 	 * mismatching generation values here and thus must invalidate the
			
@@ -1002,8 +997,10 @@ struct btrfs_dev_replace {
 
				 	pid_t lock_owner;
			
 
				 	atomic_t nesting_level;
			
 
				 	struct mutex lock_finishing_cancel_unmount;
			
 
				-	struct mutex lock_management_lock;
			
 
				-	struct mutex lock;
			
 
				+	rwlock_t lock;
			
 
				+	atomic_t read_locks;
			
 
				+	atomic_t blocking_readers;
			
 
				+	wait_queue_head_t read_lock_wq;
			
 
				 
			
 
				 	struct btrfs_scrub_progress scrub_progress;
			
 
				 };
			
@@ -1222,10 +1219,10 @@ struct btrfs_space_info {
 
				 	 * we've called update_block_group and dropped the bytes_used counter
			
 
				 	 * and increased the bytes_pinned counter.  However this means that
			
 
				 	 * bytes_pinned does not reflect the bytes that will be pinned once the
			
 
				-	 * delayed refs are flushed, so this counter is inc'ed everytime we call
			
 
				-	 * btrfs_free_extent so it is a realtime count of what will be freed
			
 
				-	 * once the transaction is committed.  It will be zero'ed everytime the
			
 
				-	 * transaction commits.
			
 
				+	 * delayed refs are flushed, so this counter is inc'ed every time we
			
 
				+	 * call btrfs_free_extent so it is a realtime count of what will be
			
 
				+	 * freed once the transaction is committed.  It will be zero'ed every
			
 
				+	 * time the transaction commits.
			
 
				 	 */
			
 
				 	struct percpu_counter total_bytes_pinned;
			
 
				 
			
@@ -1822,6 +1819,9 @@ struct btrfs_fs_info {
 
				 	spinlock_t reada_lock;
			
 
				 	struct radix_tree_root reada_tree;
			
 
				 
			
 
				+	/* readahead works cnt */
			
 
				+	atomic_t reada_works_cnt;
			
 
				+
			
 
				 	/* Extent buffer radix tree */
			
 
				 	spinlock_t buffer_lock;
			
 
				 	struct radix_tree_root buffer_radix;
			
@@ -2185,13 +2185,43 @@ struct btrfs_ioctl_defrag_range_args {
 
				  */
			
 
				 #define BTRFS_QGROUP_RELATION_KEY       246
			
 
				 
			
 
				+/*
			
 
				+ * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
			
 
				+ */
			
 
				 #define BTRFS_BALANCE_ITEM_KEY	248
			
 
				 
			
 
				 /*
			
 
				- * Persistantly stores the io stats in the device tree.
			
 
				- * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
			
 
				+ * The key type for tree items that are stored persistently, but do not need to
			
 
				+ * exist for an extended period of time. The items can exist in any tree.
			
 
				+ *
			
 
				+ * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
			
 
				+ *
			
 
				+ * Existing items:
			
 
				+ *
			
 
				+ * - balance status item
			
 
				+ *   (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
			
 
				  */
			
 
				-#define BTRFS_DEV_STATS_KEY	249
			
 
				+#define BTRFS_TEMPORARY_ITEM_KEY	248
			
 
				+
			
 
				+/*
			
 
				+ * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
			
 
				+ */
			
 
				+#define BTRFS_DEV_STATS_KEY		249
			
 
				+
			
 
				+/*
			
 
				+ * The key type for tree items that are stored persistently and usually exist
			
 
				+ * for a long period, e.g. filesystem lifetime. The item kinds can be status
			
 
				+ * information, stats or preference values. The item can exist in any tree.
			
 
				+ *
			
 
				+ * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
			
 
				+ *
			
 
				+ * Existing items:
			
 
				+ *
			
 
				+ * - device statistics, store IO stats in the device tree, one key for all
			
 
				+ *   stats
			
 
				+ *   (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
			
 
				+ */
			
 
				+#define BTRFS_PERSISTENT_ITEM_KEY	249
			
 
				 
			
 
				 /*
			
 
				  * Persistently stores the device replace state in the device tree.
			
@@ -2241,7 +2271,7 @@ struct btrfs_ioctl_defrag_range_args {
 
				 #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
			
 
				 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
			
 
				 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
			
 
				-#define BTRFS_MOUNT_RECOVERY		(1 << 18)
			
 
				+#define BTRFS_MOUNT_USEBACKUPROOT	(1 << 18)
			
 
				 #define BTRFS_MOUNT_SKIP_BALANCE	(1 << 19)
			
 
				 #define BTRFS_MOUNT_CHECK_INTEGRITY	(1 << 20)
			
 
				 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
			
@@ -2250,9 +2280,10 @@ struct btrfs_ioctl_defrag_range_args {
 
				 #define BTRFS_MOUNT_FRAGMENT_DATA	(1 << 24)
			
 
				 #define BTRFS_MOUNT_FRAGMENT_METADATA	(1 << 25)
			
 
				 #define BTRFS_MOUNT_FREE_SPACE_TREE	(1 << 26)
			
 
				+#define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
			
 
				 
			
 
				 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
			
 
				-#define BTRFS_DEFAULT_MAX_INLINE	(8192)
			
 
				+#define BTRFS_DEFAULT_MAX_INLINE	(2048)
			
 
				 
			
 
				 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
			
 
				 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
			
@@ -2353,6 +2384,9 @@ struct btrfs_map_token {
 
				 	unsigned long offset;
			
 
				 };
			
 
				 
			
 
				+#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
			
 
				+				((bytes) >> (fs_info)->sb->s_blocksize_bits)
			
 
				+
			
 
				 static inline void btrfs_init_map_token (struct btrfs_map_token *token)
			
 
				 {
			
 
				 	token->kaddr = NULL;
			
@@ -3448,8 +3482,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
 
				 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
			
 
				 						 unsigned num_items)
			
 
				 {
			
 
				-	return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
			
 
				-		2 * num_items;
			
 
				+	return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -4027,7 +4060,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 
				 			struct btrfs_root *root,
			
 
				 			struct inode *dir, u64 objectid,
			
 
				 			const char *name, int name_len);
			
 
				-int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
			
 
				+int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
			
 
				 			int front);
			
 
				 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
			
 
				 			       struct btrfs_root *root,
			
@@ -4089,6 +4122,7 @@ void btrfs_test_inode_set_ops(struct inode *inode);
 
				 
			
 
				 /* ioctl.c */
			
 
				 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
			
 
				+int btrfs_ioctl_get_supported_features(void __user *arg);
			
 
				 void btrfs_update_iflags(struct inode *inode);
			
 
				 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
			
 
				 int btrfs_is_empty_uuid(u8 *uuid);
			
@@ -4151,7 +4185,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
 
				 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
			
 
				 
			
 
				 /* super.c */
			
 
				-int btrfs_parse_options(struct btrfs_root *root, char *options);
			
 
				+int btrfs_parse_options(struct btrfs_root *root, char *options,
			
 
				+			unsigned long new_flags);
			
 
				 int btrfs_sync_fs(struct super_block *sb, int wait);
			
 
				 
			
 
				 #ifdef CONFIG_PRINTK
			
@@ -4525,8 +4560,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 
				 			      struct btrfs_key *start, struct btrfs_key *end);
			
 
				 int btrfs_reada_wait(void *handle);
			
 
				 void btrfs_reada_detach(void *handle);
			
 
				-int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
			
 
				-			 u64 start, int err);
			
 
				+int btree_readahead_hook(struct btrfs_fs_info *fs_info,
			
 
				+			 struct extent_buffer *eb, u64 start, int err);
			
 
				 
			
 
				 static inline int is_fstree(u64 rootid)
			
 
				 {
			
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -43,8 +43,7 @@ int __init btrfs_delayed_inode_init(void)
 
				 
			
 
				 void btrfs_delayed_inode_exit(void)
			
 
				 {
			
 
				-	if (delayed_node_cache)
			
 
				-		kmem_cache_destroy(delayed_node_cache);
			
 
				+	kmem_cache_destroy(delayed_node_cache);
			
 
				 }
			
 
				 
			
 
				 static inline void btrfs_init_delayed_node(
			
@@ -651,9 +650,14 @@ static int btrfs_delayed_inode_reserve_metadata(
 
				 			goto out;
			
 
				 
			
 
				 		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
			
 
				-		if (!WARN_ON(ret))
			
 
				+		if (!ret)
			
 
				 			goto out;
			
 
				 
			
 
				+		if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
			
 
				+			btrfs_debug(root->fs_info,
			
 
				+				    "block rsv migrate returned %d", ret);
			
 
				+			WARN_ON(1);
			
 
				+		}
			
 
				 		/*
			
 
				 		 * Ok this is a problem, let's just steal from the global rsv
			
 
				 		 * since this really shouldn't happen that often.
			
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -929,14 +929,10 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 
				 
			
 
				 void btrfs_delayed_ref_exit(void)
			
 
				 {
			
 
				-	if (btrfs_delayed_ref_head_cachep)
			
 
				-		kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
			
 
				-	if (btrfs_delayed_tree_ref_cachep)
			
 
				-		kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
			
 
				-	if (btrfs_delayed_data_ref_cachep)
			
 
				-		kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
			
 
				-	if (btrfs_delayed_extent_op_cachep)
			
 
				-		kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
			
 
				+	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
			
 
				+	kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
			
 
				+	kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
			
 
				+	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
			
 
				 }
			
 
				 
			
 
				 int btrfs_delayed_ref_init(void)
			
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 
				 	struct btrfs_dev_replace_item *ptr;
			
 
				 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				 
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 0);
			
 
				 	if (!dev_replace->is_valid ||
			
 
				 	    !dev_replace->item_needs_writeback) {
			
 
				-		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		btrfs_dev_replace_unlock(dev_replace, 0);
			
 
				 		return 0;
			
 
				 	}
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 0);
			
 
				 
			
 
				 	key.objectid = 0;
			
 
				 	key.type = BTRFS_DEV_REPLACE_KEY;
			
@@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 
				 	ptr = btrfs_item_ptr(eb, path->slots[0],
			
 
				 			     struct btrfs_dev_replace_item);
			
 
				 
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 1);
			
 
				 	if (dev_replace->srcdev)
			
 
				 		btrfs_set_dev_replace_src_devid(eb, ptr,
			
 
				 			dev_replace->srcdev->devid);
			
@@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 
				 	btrfs_set_dev_replace_cursor_right(eb, ptr,
			
 
				 		dev_replace->cursor_right);
			
 
				 	dev_replace->item_needs_writeback = 0;
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 
			
 
				 	btrfs_mark_buffer_dirty(eb);
			
 
				 
			
@@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 
				 		return PTR_ERR(trans);
			
 
				 	}
			
 
				 
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 1);
			
 
				 	switch (dev_replace->replace_state) {
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
@@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 
				 	dev_replace->is_valid = 1;
			
 
				 	dev_replace->item_needs_writeback = 1;
			
 
				 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 
			
 
				 	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
			
 
				 	if (ret)
			
@@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 
				 	trans = btrfs_start_transaction(root, 0);
			
 
				 	if (IS_ERR(trans)) {
			
 
				 		ret = PTR_ERR(trans);
			
 
				-		btrfs_dev_replace_lock(dev_replace);
			
 
				+		btrfs_dev_replace_lock(dev_replace, 1);
			
 
				 		goto leave;
			
 
				 	}
			
 
				 
			
@@ -433,7 +433,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 
				 leave:
			
 
				 	dev_replace->srcdev = NULL;
			
 
				 	dev_replace->tgtdev = NULL;
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 	btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
			
 
				 	return ret;
			
 
				 }
			
@@ -471,18 +471,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
				 	/* don't allow cancel or unmount to disturb the finishing procedure */
			
 
				 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				 
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 0);
			
 
				 	/* was the operation canceled, or is it finished? */
			
 
				 	if (dev_replace->replace_state !=
			
 
				 	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
			
 
				-		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		btrfs_dev_replace_unlock(dev_replace, 0);
			
 
				 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				 		return 0;
			
 
				 	}
			
 
				 
			
 
				 	tgt_device = dev_replace->tgtdev;
			
 
				 	src_device = dev_replace->srcdev;
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 0);
			
 
				 
			
 
				 	/*
			
 
				 	 * flush all outstanding I/O and inode extent mappings before the
			
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
				 	/* keep away write_all_supers() during the finishing procedure */
			
 
				 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				 	mutex_lock(&root->fs_info->chunk_mutex);
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 1);
			
 
				 	dev_replace->replace_state =
			
 
				 		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
			
 
				 			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
			
@@ -528,7 +528,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
				 			        rcu_str_deref(src_device->name),
			
 
				 			      src_device->devid,
			
 
				 			      rcu_str_deref(tgt_device->name), scrub_ret);
			
 
				-		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 		mutex_unlock(&root->fs_info->chunk_mutex);
			
 
				 		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
			
 
				 		mutex_unlock(&uuid_mutex);
			
@@ -565,7 +565,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
				 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
			
 
				 	fs_info->fs_devices->rw_devices++;
			
 
				 
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 
			
 
				 	btrfs_rm_dev_replace_blocked(fs_info);
			
 
				 
			
@@ -649,7 +649,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 
				 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				 	struct btrfs_device *srcdev;
			
 
				 
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 0);
			
 
				 	/* even if !dev_replace_is_valid, the values are good enough for
			
 
				 	 * the replace_status ioctl */
			
 
				 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
			
@@ -675,7 +675,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 
				 			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
			
 
				 		break;
			
 
				 	}
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 0);
			
 
				 }
			
 
				 
			
 
				 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
			
@@ -698,13 +698,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 
				 		return -EROFS;
			
 
				 
			
 
				 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 1);
			
 
				 	switch (dev_replace->replace_state) {
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
			
 
				 		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
			
 
				-		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 		goto leave;
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
			
@@ -717,7 +717,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 
				 	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
			
 
				 	dev_replace->time_stopped = get_seconds();
			
 
				 	dev_replace->item_needs_writeback = 1;
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 	btrfs_scrub_cancel(fs_info);
			
 
				 
			
 
				 	trans = btrfs_start_transaction(root, 0);
			
@@ -740,7 +740,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 
				 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				 
			
 
				 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 1);
			
 
				 	switch (dev_replace->replace_state) {
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
@@ -756,7 +756,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 
				 		break;
			
 
				 	}
			
 
				 
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			
 
				 }
			
 
				 
			
@@ -766,12 +766,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 
				 	struct task_struct *task;
			
 
				 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
			
 
				 
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 1);
			
 
				 	switch (dev_replace->replace_state) {
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
			
 
				-		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 		return 0;
			
 
				 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
			
 
				 		break;
			
@@ -784,10 +784,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 
				 		btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
			
 
				 		btrfs_info(fs_info,
			
 
				 			"you may cancel the operation after 'mount -o degraded'");
			
 
				-		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 		return 0;
			
 
				 	}
			
 
				-	btrfs_dev_replace_unlock(dev_replace);
			
 
				+	btrfs_dev_replace_unlock(dev_replace, 1);
			
 
				 
			
 
				 	WARN_ON(atomic_xchg(
			
 
				 		&fs_info->mutually_exclusive_operation_running, 1));
			
@@ -802,7 +802,7 @@ static int btrfs_dev_replace_kthread(void *data)
 
				 	struct btrfs_ioctl_dev_replace_args *status_args;
			
 
				 	u64 progress;
			
 
				 
			
 
				-	status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
			
 
				+	status_args = kzalloc(sizeof(*status_args), GFP_KERNEL);
			
 
				 	if (status_args) {
			
 
				 		btrfs_dev_replace_status(fs_info, status_args);
			
 
				 		progress = status_args->status.progress_1000;
			
@@ -858,55 +858,65 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
 
				 		 * not called and the the filesystem is remounted
			
 
				 		 * in degraded state. This does not stop the
			
 
				 		 * dev_replace procedure. It needs to be canceled
			
 
				-		 * manually if the cancelation is wanted.
			
 
				+		 * manually if the cancellation is wanted.
			
 
				 		 */
			
 
				 		break;
			
 
				 	}
			
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
			
 
				+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
			
 
				 {
			
 
				-	/* the beginning is just an optimization for the typical case */
			
 
				-	if (atomic_read(&dev_replace->nesting_level) == 0) {
			
 
				-acquire_lock:
			
 
				-		/* this is not a nested case where the same thread
			
 
				-		 * is trying to acqurire the same lock twice */
			
 
				-		mutex_lock(&dev_replace->lock);
			
 
				-		mutex_lock(&dev_replace->lock_management_lock);
			
 
				-		dev_replace->lock_owner = current->pid;
			
 
				-		atomic_inc(&dev_replace->nesting_level);
			
 
				-		mutex_unlock(&dev_replace->lock_management_lock);
			
 
				-		return;
			
 
				+	if (rw == 1) {
			
 
				+		/* write */
			
 
				+again:
			
 
				+		wait_event(dev_replace->read_lock_wq,
			
 
				+			   atomic_read(&dev_replace->blocking_readers) == 0);
			
 
				+		write_lock(&dev_replace->lock);
			
 
				+		if (atomic_read(&dev_replace->blocking_readers)) {
			
 
				+			write_unlock(&dev_replace->lock);
			
 
				+			goto again;
			
 
				+		}
			
 
				+	} else {
			
 
				+		read_lock(&dev_replace->lock);
			
 
				+		atomic_inc(&dev_replace->read_locks);
			
 
				 	}
			
 
				+}
			
 
				 
			
 
				-	mutex_lock(&dev_replace->lock_management_lock);
			
 
				-	if (atomic_read(&dev_replace->nesting_level) > 0 &&
			
 
				-	    dev_replace->lock_owner == current->pid) {
			
 
				-		WARN_ON(!mutex_is_locked(&dev_replace->lock));
			
 
				-		atomic_inc(&dev_replace->nesting_level);
			
 
				-		mutex_unlock(&dev_replace->lock_management_lock);
			
 
				-		return;
			
 
				+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
			
 
				+{
			
 
				+	if (rw == 1) {
			
 
				+		/* write */
			
 
				+		ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
			
 
				+		write_unlock(&dev_replace->lock);
			
 
				+	} else {
			
 
				+		ASSERT(atomic_read(&dev_replace->read_locks) > 0);
			
 
				+		atomic_dec(&dev_replace->read_locks);
			
 
				+		read_unlock(&dev_replace->lock);
			
 
				 	}
			
 
				+}
			
 
				 
			
 
				-	mutex_unlock(&dev_replace->lock_management_lock);
			
 
				-	goto acquire_lock;
			
 
				+/* inc blocking cnt and release read lock */
			
 
				+void btrfs_dev_replace_set_lock_blocking(
			
 
				+					struct btrfs_dev_replace *dev_replace)
			
 
				+{
			
 
				+	/* only set blocking for read lock */
			
 
				+	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
			
 
				+	atomic_inc(&dev_replace->blocking_readers);
			
 
				+	read_unlock(&dev_replace->lock);
			
 
				 }
			
 
				 
			
 
				-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
			
 
				+/* acquire read lock and dec blocking cnt */
			
 
				+void btrfs_dev_replace_clear_lock_blocking(
			
 
				+					struct btrfs_dev_replace *dev_replace)
			
 
				 {
			
 
				-	WARN_ON(!mutex_is_locked(&dev_replace->lock));
			
 
				-	mutex_lock(&dev_replace->lock_management_lock);
			
 
				-	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
			
 
				-	WARN_ON(dev_replace->lock_owner != current->pid);
			
 
				-	atomic_dec(&dev_replace->nesting_level);
			
 
				-	if (atomic_read(&dev_replace->nesting_level) == 0) {
			
 
				-		dev_replace->lock_owner = 0;
			
 
				-		mutex_unlock(&dev_replace->lock_management_lock);
			
 
				-		mutex_unlock(&dev_replace->lock);
			
 
				-	} else {
			
 
				-		mutex_unlock(&dev_replace->lock_management_lock);
			
 
				-	}
			
 
				+	/* only clear blocking for read lock */
			
 
				+	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
			
 
				+	ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
			
 
				+	read_lock(&dev_replace->lock);
			
 
				+	if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
			
 
				+	    waitqueue_active(&dev_replace->read_lock_wq))
			
 
				+		wake_up(&dev_replace->read_lock_wq);
			
 
				 }
			
 
				 
			
 
				 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
			
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -34,8 +34,11 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
 
				 void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
			
 
				 int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
			
 
				 int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
			
 
				-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
			
 
				-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
			
 
				+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
			
 
				+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
			
 
				+void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
			
 
				+void btrfs_dev_replace_clear_lock_blocking(
			
 
				+					struct btrfs_dev_replace *dev_replace);
			
 
				 
			
 
				 static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
			
 
				 {
			
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,7 @@
 
				 #include "raid56.h"
			
 
				 #include "sysfs.h"
			
 
				 #include "qgroup.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 #ifdef CONFIG_X86
			
 
				 #include <asm/cpufeature.h>
			
@@ -110,8 +111,7 @@ int __init btrfs_end_io_wq_init(void)
 
				 
			
 
				 void btrfs_end_io_wq_exit(void)
			
 
				 {
			
 
				-	if (btrfs_end_io_wq_cache)
			
 
				-		kmem_cache_destroy(btrfs_end_io_wq_cache);
			
 
				+	kmem_cache_destroy(btrfs_end_io_wq_cache);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -612,6 +612,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
				 	int found_level;
			
 
				 	struct extent_buffer *eb;
			
 
				 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
			
 
				+	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				 	int ret = 0;
			
 
				 	int reads_done;
			
 
				 
			
@@ -637,21 +638,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
				 
			
 
				 	found_start = btrfs_header_bytenr(eb);
			
 
				 	if (found_start != eb->start) {
			
 
				-		btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
			
 
				-			       found_start, eb->start);
			
 
				+		btrfs_err_rl(fs_info, "bad tree block start %llu %llu",
			
 
				+			     found_start, eb->start);
			
 
				 		ret = -EIO;
			
 
				 		goto err;
			
 
				 	}
			
 
				-	if (check_tree_block_fsid(root->fs_info, eb)) {
			
 
				-		btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
			
 
				-			       eb->start);
			
 
				+	if (check_tree_block_fsid(fs_info, eb)) {
			
 
				+		btrfs_err_rl(fs_info, "bad fsid on block %llu",
			
 
				+			     eb->start);
			
 
				 		ret = -EIO;
			
 
				 		goto err;
			
 
				 	}
			
 
				 	found_level = btrfs_header_level(eb);
			
 
				 	if (found_level >= BTRFS_MAX_LEVEL) {
			
 
				-		btrfs_err(root->fs_info, "bad tree block level %d",
			
 
				-			   (int)btrfs_header_level(eb));
			
 
				+		btrfs_err(fs_info, "bad tree block level %d",
			
 
				+			  (int)btrfs_header_level(eb));
			
 
				 		ret = -EIO;
			
 
				 		goto err;
			
 
				 	}
			
@@ -659,7 +660,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
				 	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
			
 
				 				       eb, found_level);
			
 
				 
			
 
				-	ret = csum_tree_block(root->fs_info, eb, 1);
			
 
				+	ret = csum_tree_block(fs_info, eb, 1);
			
 
				 	if (ret) {
			
 
				 		ret = -EIO;
			
 
				 		goto err;
			
@@ -680,7 +681,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
				 err:
			
 
				 	if (reads_done &&
			
 
				 	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
			
 
				-		btree_readahead_hook(root, eb, eb->start, ret);
			
 
				+		btree_readahead_hook(fs_info, eb, eb->start, ret);
			
 
				 
			
 
				 	if (ret) {
			
 
				 		/*
			
@@ -699,14 +700,13 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
				 static int btree_io_failed_hook(struct page *page, int failed_mirror)
			
 
				 {
			
 
				 	struct extent_buffer *eb;
			
 
				-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
			
 
				 
			
 
				 	eb = (struct extent_buffer *)page->private;
			
 
				 	set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
			
 
				 	eb->read_mirror = failed_mirror;
			
 
				 	atomic_dec(&eb->io_pages);
			
 
				 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
			
 
				-		btree_readahead_hook(root, eb, eb->start, -EIO);
			
 
				+		btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
			
 
				 	return -EIO;	/* we fixed nothing */
			
 
				 }
			
 
				 
			
@@ -816,7 +816,7 @@ static void run_one_async_done(struct btrfs_work *work)
 
				 	    waitqueue_active(&fs_info->async_submit_wait))
			
 
				 		wake_up(&fs_info->async_submit_wait);
			
 
				 
			
 
				-	/* If an error occured we just want to clean up the bio and move on */
			
 
				+	/* If an error occurred we just want to clean up the bio and move on */
			
 
				 	if (async->error) {
			
 
				 		async->bio->bi_error = async->error;
			
 
				 		bio_endio(async->bio);
			
@@ -1296,9 +1296,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
 
				 	spin_lock_init(&root->root_item_lock);
			
 
				 }
			
 
				 
			
 
				-static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
			
 
				+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
			
 
				+		gfp_t flags)
			
 
				 {
			
 
				-	struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
			
 
				+	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
			
 
				 	if (root)
			
 
				 		root->fs_info = fs_info;
			
 
				 	return root;
			
@@ -1310,7 +1311,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
 
				 {
			
 
				 	struct btrfs_root *root;
			
 
				 
			
 
				-	root = btrfs_alloc_root(NULL);
			
 
				+	root = btrfs_alloc_root(NULL, GFP_KERNEL);
			
 
				 	if (!root)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 	__setup_root(4096, 4096, 4096, root, NULL, 1);
			
@@ -1332,7 +1333,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 
				 	int ret = 0;
			
 
				 	uuid_le uuid;
			
 
				 
			
 
				-	root = btrfs_alloc_root(fs_info);
			
 
				+	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
			
 
				 	if (!root)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 
			
@@ -1408,7 +1409,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 
				 	struct btrfs_root *tree_root = fs_info->tree_root;
			
 
				 	struct extent_buffer *leaf;
			
 
				 
			
 
				-	root = btrfs_alloc_root(fs_info);
			
 
				+	root = btrfs_alloc_root(fs_info, GFP_NOFS);
			
 
				 	if (!root)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 
			
@@ -1506,7 +1507,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 
				 	if (!path)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 
			
 
				-	root = btrfs_alloc_root(fs_info);
			
 
				+	root = btrfs_alloc_root(fs_info, GFP_NOFS);
			
 
				 	if (!root) {
			
 
				 		ret = -ENOMEM;
			
 
				 		goto alloc_fail;
			
@@ -2272,9 +2273,11 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
 
				 	fs_info->dev_replace.lock_owner = 0;
			
 
				 	atomic_set(&fs_info->dev_replace.nesting_level, 0);
			
 
				 	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
			
 
				-	mutex_init(&fs_info->dev_replace.lock_management_lock);
			
 
				-	mutex_init(&fs_info->dev_replace.lock);
			
 
				+	rwlock_init(&fs_info->dev_replace.lock);
			
 
				+	atomic_set(&fs_info->dev_replace.read_locks, 0);
			
 
				+	atomic_set(&fs_info->dev_replace.blocking_readers, 0);
			
 
				 	init_waitqueue_head(&fs_info->replace_wait);
			
 
				+	init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
			
 
				 }
			
 
				 
			
 
				 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
			
@@ -2385,7 +2388,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 
				 		return -EIO;
			
 
				 	}
			
 
				 
			
 
				-	log_tree_root = btrfs_alloc_root(fs_info);
			
 
				+	log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
			
 
				 	if (!log_tree_root)
			
 
				 		return -ENOMEM;
			
 
				 
			
@@ -2510,8 +2513,8 @@ int open_ctree(struct super_block *sb,
 
				 	int backup_index = 0;
			
 
				 	int max_active;
			
 
				 
			
 
				-	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
			
 
				-	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
			
 
				+	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
			
 
				+	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
			
 
				 	if (!tree_root || !chunk_root) {
			
 
				 		err = -ENOMEM;
			
 
				 		goto fail;
			
@@ -2603,6 +2606,7 @@ int open_ctree(struct super_block *sb,
 
				 	atomic_set(&fs_info->nr_async_bios, 0);
			
 
				 	atomic_set(&fs_info->defrag_running, 0);
			
 
				 	atomic_set(&fs_info->qgroup_op_seq, 0);
			
 
				+	atomic_set(&fs_info->reada_works_cnt, 0);
			
 
				 	atomic64_set(&fs_info->tree_mod_seq, 0);
			
 
				 	fs_info->sb = sb;
			
 
				 	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
			
@@ -2622,7 +2626,7 @@ int open_ctree(struct super_block *sb,
 
				 	INIT_LIST_HEAD(&fs_info->ordered_roots);
			
 
				 	spin_lock_init(&fs_info->ordered_root_lock);
			
 
				 	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
			
 
				-					GFP_NOFS);
			
 
				+					GFP_KERNEL);
			
 
				 	if (!fs_info->delayed_root) {
			
 
				 		err = -ENOMEM;
			
 
				 		goto fail_iput;
			
@@ -2750,7 +2754,7 @@ int open_ctree(struct super_block *sb,
 
				 	 */
			
 
				 	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
			
 
				 
			
 
				-	ret = btrfs_parse_options(tree_root, options);
			
 
				+	ret = btrfs_parse_options(tree_root, options, sb->s_flags);
			
 
				 	if (ret) {
			
 
				 		err = ret;
			
 
				 		goto fail_alloc;
			
@@ -3029,8 +3033,9 @@ int open_ctree(struct super_block *sb,
 
				 	if (ret)
			
 
				 		goto fail_trans_kthread;
			
 
				 
			
 
				-	/* do not make disk changes in broken FS */
			
 
				-	if (btrfs_super_log_root(disk_super) != 0) {
			
 
				+	/* do not make disk changes in broken FS or if nologreplay is given */
			
 
				+	if (btrfs_super_log_root(disk_super) != 0 &&
			
 
				+	    !btrfs_test_opt(tree_root, NOLOGREPLAY)) {
			
 
				 		ret = btrfs_replay_log(fs_info, fs_devices);
			
 
				 		if (ret) {
			
 
				 			err = ret;
			
@@ -3146,6 +3151,12 @@ int open_ctree(struct super_block *sb,
 
				 
			
 
				 	fs_info->open = 1;
			
 
				 
			
 
				+	/*
			
 
				+	 * backuproot only affects mount behavior, and if open_ctree succeeded,
			
 
				+	 * no need to keep the flag
			
 
				+	 */
			
 
				+	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
			
 
				+
			
 
				 	return 0;
			
 
				 
			
 
				 fail_qgroup:
			
@@ -3200,7 +3211,7 @@ int open_ctree(struct super_block *sb,
 
				 	return err;
			
 
				 
			
 
				 recovery_tree_root:
			
 
				-	if (!btrfs_test_opt(tree_root, RECOVERY))
			
 
				+	if (!btrfs_test_opt(tree_root, USEBACKUPROOT))
			
 
				 		goto fail_tree_roots;
			
 
				 
			
 
				 	free_root_pointers(fs_info, 0);
			
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4838,7 +4838,7 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
 
				 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
			
 
				 
			
 
				 	/* If we're just plain full then async reclaim just slows us down. */
			
 
				-	if (space_info->bytes_used >= thresh)
			
 
				+	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
			
 
				 		return 0;
			
 
				 
			
 
				 	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
			
@@ -5373,27 +5373,33 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 
				 
			
 
				 	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
			
 
				 
			
 
				-	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
			
 
				-		    sinfo->bytes_reserved + sinfo->bytes_readonly +
			
 
				-		    sinfo->bytes_may_use;
			
 
				-
			
 
				-	if (sinfo->total_bytes > num_bytes) {
			
 
				-		num_bytes = sinfo->total_bytes - num_bytes;
			
 
				-		block_rsv->reserved += num_bytes;
			
 
				-		sinfo->bytes_may_use += num_bytes;
			
 
				-		trace_btrfs_space_reservation(fs_info, "space_info",
			
 
				-				      sinfo->flags, num_bytes, 1);
			
 
				-	}
			
 
				-
			
 
				-	if (block_rsv->reserved >= block_rsv->size) {
			
 
				+	if (block_rsv->reserved < block_rsv->size) {
			
 
				+		num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
			
 
				+			sinfo->bytes_reserved + sinfo->bytes_readonly +
			
 
				+			sinfo->bytes_may_use;
			
 
				+		if (sinfo->total_bytes > num_bytes) {
			
 
				+			num_bytes = sinfo->total_bytes - num_bytes;
			
 
				+			num_bytes = min(num_bytes,
			
 
				+					block_rsv->size - block_rsv->reserved);
			
 
				+			block_rsv->reserved += num_bytes;
			
 
				+			sinfo->bytes_may_use += num_bytes;
			
 
				+			trace_btrfs_space_reservation(fs_info, "space_info",
			
 
				+						      sinfo->flags, num_bytes,
			
 
				+						      1);
			
 
				+		}
			
 
				+	} else if (block_rsv->reserved > block_rsv->size) {
			
 
				 		num_bytes = block_rsv->reserved - block_rsv->size;
			
 
				 		sinfo->bytes_may_use -= num_bytes;
			
 
				 		trace_btrfs_space_reservation(fs_info, "space_info",
			
 
				 				      sinfo->flags, num_bytes, 0);
			
 
				 		block_rsv->reserved = block_rsv->size;
			
 
				-		block_rsv->full = 1;
			
 
				 	}
			
 
				 
			
 
				+	if (block_rsv->reserved == block_rsv->size)
			
 
				+		block_rsv->full = 1;
			
 
				+	else
			
 
				+		block_rsv->full = 0;
			
 
				+
			
 
				 	spin_unlock(&block_rsv->lock);
			
 
				 	spin_unlock(&sinfo->lock);
			
 
				 }
			
@@ -5752,7 +5758,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
				 
			
 
				 		/*
			
 
				 		 * This is tricky, but first we need to figure out how much we
			
 
				-		 * free'd from any free-ers that occured during this
			
 
				+		 * free'd from any free-ers that occurred during this
			
 
				 		 * reservation, so we reset ->csum_bytes to the csum_bytes
			
 
				 		 * before we dropped our lock, and then call the free for the
			
 
				 		 * number of bytes that were freed while we were trying our
			
@@ -7018,7 +7024,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
 
				 		   struct btrfs_free_cluster *cluster,
			
 
				 		   int delalloc)
			
 
				 {
			
 
				-	struct btrfs_block_group_cache *used_bg;
			
 
				+	struct btrfs_block_group_cache *used_bg = NULL;
			
 
				 	bool locked = false;
			
 
				 again:
			
 
				 	spin_lock(&cluster->refill_lock);
			
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -206,10 +206,8 @@ void extent_io_exit(void)
 
				 	 * destroy caches.
			
 
				 	 */
			
 
				 	rcu_barrier();
			
 
				-	if (extent_state_cache)
			
 
				-		kmem_cache_destroy(extent_state_cache);
			
 
				-	if (extent_buffer_cache)
			
 
				-		kmem_cache_destroy(extent_buffer_cache);
			
 
				+	kmem_cache_destroy(extent_state_cache);
			
 
				+	kmem_cache_destroy(extent_buffer_cache);
			
 
				 	if (btrfs_bioset)
			
 
				 		bioset_free(btrfs_bioset);
			
 
				 }
			
@@ -232,7 +230,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 
				 	if (!state)
			
 
				 		return state;
			
 
				 	state->state = 0;
			
 
				-	state->private = 0;
			
 
				+	state->failrec = NULL;
			
 
				 	RB_CLEAR_NODE(&state->rb_node);
			
 
				 	btrfs_leak_debug_add(&state->leak_list, &states);
			
 
				 	atomic_set(&state->refs, 1);
			
@@ -1844,7 +1842,8 @@ u64 count_range_bits(struct extent_io_tree *tree,
 
				  * set the private field for a given byte offset in the tree.  If there isn't
			
 
				  * an extent_state there already, this does nothing.
			
 
				  */
			
 
				-static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
			
 
				+static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
			
 
				+		struct io_failure_record *failrec)
			
 
				 {
			
 
				 	struct rb_node *node;
			
 
				 	struct extent_state *state;
			
@@ -1865,13 +1864,14 @@ static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private
 
				 		ret = -ENOENT;
			
 
				 		goto out;
			
 
				 	}
			
 
				-	state->private = private;
			
 
				+	state->failrec = failrec;
			
 
				 out:
			
 
				 	spin_unlock(&tree->lock);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
			
 
				+static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
			
 
				+		struct io_failure_record **failrec)
			
 
				 {
			
 
				 	struct rb_node *node;
			
 
				 	struct extent_state *state;
			
@@ -1892,7 +1892,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
 
				 		ret = -ENOENT;
			
 
				 		goto out;
			
 
				 	}
			
 
				-	*private = state->private;
			
 
				+	*failrec = state->failrec;
			
 
				 out:
			
 
				 	spin_unlock(&tree->lock);
			
 
				 	return ret;
			
@@ -1972,7 +1972,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 
				 	int err = 0;
			
 
				 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
			
 
				 
			
 
				-	set_state_private(failure_tree, rec->start, 0);
			
 
				+	set_state_failrec(failure_tree, rec->start, NULL);
			
 
				 	ret = clear_extent_bits(failure_tree, rec->start,
			
 
				 				rec->start + rec->len - 1,
			
 
				 				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
			
@@ -2089,7 +2089,6 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
 
				 		     unsigned int pg_offset)
			
 
				 {
			
 
				 	u64 private;
			
 
				-	u64 private_failure;
			
 
				 	struct io_failure_record *failrec;
			
 
				 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
			
 
				 	struct extent_state *state;
			
@@ -2102,12 +2101,11 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
 
				 	if (!ret)
			
 
				 		return 0;
			
 
				 
			
 
				-	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
			
 
				-				&private_failure);
			
 
				+	ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start,
			
 
				+			&failrec);
			
 
				 	if (ret)
			
 
				 		return 0;
			
 
				 
			
 
				-	failrec = (struct io_failure_record *)(unsigned long) private_failure;
			
 
				 	BUG_ON(!failrec->this_mirror);
			
 
				 
			
 
				 	if (failrec->in_validation) {
			
@@ -2167,7 +2165,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
 
				 
			
 
				 		next = next_state(state);
			
 
				 
			
 
				-		failrec = (struct io_failure_record *)(unsigned long)state->private;
			
 
				+		failrec = state->failrec;
			
 
				 		free_extent_state(state);
			
 
				 		kfree(failrec);
			
 
				 
			
@@ -2177,10 +2175,9 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
 
				 }
			
 
				 
			
 
				 int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
			
 
				-				struct io_failure_record **failrec_ret)
			
 
				+		struct io_failure_record **failrec_ret)
			
 
				 {
			
 
				 	struct io_failure_record *failrec;
			
 
				-	u64 private;
			
 
				 	struct extent_map *em;
			
 
				 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
			
 
				 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			
@@ -2188,7 +2185,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
 
				 	int ret;
			
 
				 	u64 logical;
			
 
				 
			
 
				-	ret = get_state_private(failure_tree, start, &private);
			
 
				+	ret = get_state_failrec(failure_tree, start, &failrec);
			
 
				 	if (ret) {
			
 
				 		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
			
 
				 		if (!failrec)
			
@@ -2237,8 +2234,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
 
				 		ret = set_extent_bits(failure_tree, start, end,
			
 
				 					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
			
 
				 		if (ret >= 0)
			
 
				-			ret = set_state_private(failure_tree, start,
			
 
				-						(u64)(unsigned long)failrec);
			
 
				+			ret = set_state_failrec(failure_tree, start, failrec);
			
 
				 		/* set the bits in the inode's tree */
			
 
				 		if (ret >= 0)
			
 
				 			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
			
@@ -2248,7 +2244,6 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
 
				 			return ret;
			
 
				 		}
			
 
				 	} else {
			
 
				-		failrec = (struct io_failure_record *)(unsigned long)private;
			
 
				 		pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
			
 
				 			 failrec->logical, failrec->start, failrec->len,
			
 
				 			 failrec->in_validation);
			
@@ -3177,7 +3172,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
				 
			
 
				 	while (1) {
			
 
				 		lock_extent(tree, start, end);
			
 
				-		ordered = btrfs_lookup_ordered_extent(inode, start);
			
 
				+		ordered = btrfs_lookup_ordered_range(inode, start,
			
 
				+						PAGE_CACHE_SIZE);
			
 
				 		if (!ordered)
			
 
				 			break;
			
 
				 		unlock_extent(tree, start, end);
			
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -61,6 +61,7 @@
 
				 struct extent_state;
			
 
				 struct btrfs_root;
			
 
				 struct btrfs_io_bio;
			
 
				+struct io_failure_record;
			
 
				 
			
 
				 typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
			
 
				 				       struct bio *bio, int mirror_num,
			
@@ -111,8 +112,7 @@ struct extent_state {
 
				 	atomic_t refs;
			
 
				 	unsigned state;
			
 
				 
			
 
				-	/* for use by the FS */
			
 
				-	u64 private;
			
 
				+	struct io_failure_record *failrec;
			
 
				 
			
 
				 #ifdef CONFIG_BTRFS_DEBUG
			
 
				 	struct list_head leak_list;
			
@@ -342,7 +342,6 @@ int extent_readpages(struct extent_io_tree *tree,
 
				 		     get_extent_t get_extent);
			
 
				 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
			
 
				 		__u64 start, __u64 len, get_extent_t *get_extent);
			
 
				-int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
			
 
				 void set_page_extent_mapped(struct page *page);
			
 
				 
			
 
				 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
			
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@
 
				 #include <linux/hardirq.h>
			
 
				 #include "ctree.h"
			
 
				 #include "extent_map.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 
			
 
				 static struct kmem_cache *extent_map_cache;
			
@@ -20,8 +21,7 @@ int __init extent_map_init(void)
 
				 
			
 
				 void extent_map_exit(void)
			
 
				 {
			
 
				-	if (extent_map_cache)
			
 
				-		kmem_cache_destroy(extent_map_cache);
			
 
				+	kmem_cache_destroy(extent_map_cache);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void)
 
				 
			
 
				 /**
			
 
				  * free_extent_map - drop reference count of an extent_map
			
 
				- * @em:		extent map beeing releasead
			
 
				+ * @em:		extent map being released
			
 
				  *
			
 
				  * Drops the reference out on @em by one and free the structure
			
 
				  * if the reference count hits zero.
			
@@ -422,7 +422,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 
				 /**
			
 
				  * remove_extent_mapping - removes an extent_map from the extent tree
			
 
				  * @tree:	extent tree to remove from
			
 
				- * @em:		extent map beeing removed
			
 
				+ * @em:		extent map being removed
			
 
				  *
			
 
				  * Removes @em from @tree.  No reference counts are dropped, and no checks
			
 
				  * are done to see if the range is in use
			
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,6 +25,7 @@
 
				 #include "transaction.h"
			
 
				 #include "volumes.h"
			
 
				 #include "print-tree.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
			
 
				 				   sizeof(struct btrfs_item) * 2) / \
			
@@ -172,6 +173,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 
				 	u64 item_start_offset = 0;
			
 
				 	u64 item_last_offset = 0;
			
 
				 	u64 disk_bytenr;
			
 
				+	u64 page_bytes_left;
			
 
				 	u32 diff;
			
 
				 	int nblocks;
			
 
				 	int bio_index = 0;
			
@@ -220,6 +222,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 
				 	disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
			
 
				 	if (dio)
			
 
				 		offset = logical_offset;
			
 
				+
			
 
				+	page_bytes_left = bvec->bv_len;
			
 
				 	while (bio_index < bio->bi_vcnt) {
			
 
				 		if (!dio)
			
 
				 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
			
@@ -243,7 +247,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 
				 				if (BTRFS_I(inode)->root->root_key.objectid ==
			
 
				 				    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			
 
				 					set_extent_bits(io_tree, offset,
			
 
				-						offset + bvec->bv_len - 1,
			
 
				+						offset + root->sectorsize - 1,
			
 
				 						EXTENT_NODATASUM, GFP_NOFS);
			
 
				 				} else {
			
 
				 					btrfs_info(BTRFS_I(inode)->root->fs_info,
			
@@ -281,13 +285,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 
				 found:
			
 
				 		csum += count * csum_size;
			
 
				 		nblocks -= count;
			
 
				-		bio_index += count;
			
 
				+
			
 
				 		while (count--) {
			
 
				-			disk_bytenr += bvec->bv_len;
			
 
				-			offset += bvec->bv_len;
			
 
				-			bvec++;
			
 
				+			disk_bytenr += root->sectorsize;
			
 
				+			offset += root->sectorsize;
			
 
				+			page_bytes_left -= root->sectorsize;
			
 
				+			if (!page_bytes_left) {
			
 
				+				bio_index++;
			
 
				+				/*
			
 
				+				 * make sure we're still inside the
			
 
				+				 * bio before we update page_bytes_left
			
 
				+				 */
			
 
				+				if (bio_index >= bio->bi_vcnt) {
			
 
				+					WARN_ON_ONCE(count);
			
 
				+					goto done;
			
 
				+				}
			
 
				+				bvec++;
			
 
				+				page_bytes_left = bvec->bv_len;
			
 
				+			}
			
 
				+
			
 
				 		}
			
 
				 	}
			
 
				+
			
 
				+done:
			
 
				 	btrfs_free_path(path);
			
 
				 	return 0;
			
 
				 }
			
@@ -432,6 +452,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 
				 	struct bio_vec *bvec = bio->bi_io_vec;
			
 
				 	int bio_index = 0;
			
 
				 	int index;
			
 
				+	int nr_sectors;
			
 
				+	int i;
			
 
				 	unsigned long total_bytes = 0;
			
 
				 	unsigned long this_sum_bytes = 0;
			
 
				 	u64 offset;
			
@@ -459,41 +481,56 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 
				 		if (!contig)
			
 
				 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
			
 
				 
			
 
				-		if (offset >= ordered->file_offset + ordered->len ||
			
 
				-		    offset < ordered->file_offset) {
			
 
				-			unsigned long bytes_left;
			
 
				-			sums->len = this_sum_bytes;
			
 
				-			this_sum_bytes = 0;
			
 
				-			btrfs_add_ordered_sum(inode, ordered, sums);
			
 
				-			btrfs_put_ordered_extent(ordered);
			
 
				+		data = kmap_atomic(bvec->bv_page);
			
 
				 
			
 
				-			bytes_left = bio->bi_iter.bi_size - total_bytes;
			
 
				+		nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
			
 
				+						bvec->bv_len + root->sectorsize
			
 
				+						- 1);
			
 
				+
			
 
				+		for (i = 0; i < nr_sectors; i++) {
			
 
				+			if (offset >= ordered->file_offset + ordered->len ||
			
 
				+				offset < ordered->file_offset) {
			
 
				+				unsigned long bytes_left;
			
 
				+
			
 
				+				kunmap_atomic(data);
			
 
				+				sums->len = this_sum_bytes;
			
 
				+				this_sum_bytes = 0;
			
 
				+				btrfs_add_ordered_sum(inode, ordered, sums);
			
 
				+				btrfs_put_ordered_extent(ordered);
			
 
				+
			
 
				+				bytes_left = bio->bi_iter.bi_size - total_bytes;
			
 
				+
			
 
				+				sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
			
 
				+					GFP_NOFS);
			
 
				+				BUG_ON(!sums); /* -ENOMEM */
			
 
				+				sums->len = bytes_left;
			
 
				+				ordered = btrfs_lookup_ordered_extent(inode,
			
 
				+								offset);
			
 
				+				ASSERT(ordered); /* Logic error */
			
 
				+				sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
			
 
				+					+ total_bytes;
			
 
				+				index = 0;
			
 
				+
			
 
				+				data = kmap_atomic(bvec->bv_page);
			
 
				+			}
			
 
				 
			
 
				-			sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
			
 
				-				       GFP_NOFS);
			
 
				-			BUG_ON(!sums); /* -ENOMEM */
			
 
				-			sums->len = bytes_left;
			
 
				-			ordered = btrfs_lookup_ordered_extent(inode, offset);
			
 
				-			BUG_ON(!ordered); /* Logic error */
			
 
				-			sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) +
			
 
				-				       total_bytes;
			
 
				-			index = 0;
			
 
				+			sums->sums[index] = ~(u32)0;
			
 
				+			sums->sums[index]
			
 
				+				= btrfs_csum_data(data + bvec->bv_offset
			
 
				+						+ (i * root->sectorsize),
			
 
				+						sums->sums[index],
			
 
				+						root->sectorsize);
			
 
				+			btrfs_csum_final(sums->sums[index],
			
 
				+					(char *)(sums->sums + index));
			
 
				+			index++;
			
 
				+			offset += root->sectorsize;
			
 
				+			this_sum_bytes += root->sectorsize;
			
 
				+			total_bytes += root->sectorsize;
			
 
				 		}
			
 
				 
			
 
				-		data = kmap_atomic(bvec->bv_page);
			
 
				-		sums->sums[index] = ~(u32)0;
			
 
				-		sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
			
 
				-						    sums->sums[index],
			
 
				-						    bvec->bv_len);
			
 
				 		kunmap_atomic(data);
			
 
				-		btrfs_csum_final(sums->sums[index],
			
 
				-				 (char *)(sums->sums + index));
			
 
				 
			
 
				 		bio_index++;
			
 
				-		index++;
			
 
				-		total_bytes += bvec->bv_len;
			
 
				-		this_sum_bytes += bvec->bv_len;
			
 
				-		offset += bvec->bv_len;
			
 
				 		bvec++;
			
 
				 	}
			
 
				 	this_sum_bytes = 0;
			
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
 
				 #include "locking.h"
			
 
				 #include "volumes.h"
			
 
				 #include "qgroup.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 static struct kmem_cache *btrfs_inode_defrag_cachep;
			
 
				 /*
			
@@ -498,7 +499,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 
				 	loff_t isize = i_size_read(inode);
			
 
				 
			
 
				 	start_pos = pos & ~((u64)root->sectorsize - 1);
			
 
				-	num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
			
 
				+	num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize);
			
 
				 
			
 
				 	end_of_last_block = start_pos + num_bytes - 1;
			
 
				 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
			
@@ -1379,16 +1380,19 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
 
				 static noinline int
			
 
				 lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
			
 
				 				size_t num_pages, loff_t pos,
			
 
				+				size_t write_bytes,
			
 
				 				u64 *lockstart, u64 *lockend,
			
 
				 				struct extent_state **cached_state)
			
 
				 {
			
 
				+	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				 	u64 start_pos;
			
 
				 	u64 last_pos;
			
 
				 	int i;
			
 
				 	int ret = 0;
			
 
				 
			
 
				-	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
			
 
				-	last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
			
 
				+	start_pos = round_down(pos, root->sectorsize);
			
 
				+	last_pos = start_pos
			
 
				+		+ round_up(pos + write_bytes - start_pos, root->sectorsize) - 1;
			
 
				 
			
 
				 	if (start_pos < inode->i_size) {
			
 
				 		struct btrfs_ordered_extent *ordered;
			
@@ -1503,6 +1507,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
				 
			
 
				 	while (iov_iter_count(i) > 0) {
			
 
				 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
			
 
				+		size_t sector_offset;
			
 
				 		size_t write_bytes = min(iov_iter_count(i),
			
 
				 					 nrptrs * (size_t)PAGE_CACHE_SIZE -
			
 
				 					 offset);
			
@@ -1511,6 +1516,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
				 		size_t reserve_bytes;
			
 
				 		size_t dirty_pages;
			
 
				 		size_t copied;
			
 
				+		size_t dirty_sectors;
			
 
				+		size_t num_sectors;
			
 
				 
			
 
				 		WARN_ON(num_pages > nrptrs);
			
 
				 
			
@@ -1523,29 +1530,29 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
				 			break;
			
 
				 		}
			
 
				 
			
 
				-		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
			
 
				+		sector_offset = pos & (root->sectorsize - 1);
			
 
				+		reserve_bytes = round_up(write_bytes + sector_offset,
			
 
				+				root->sectorsize);
			
 
				 
			
 
				-		if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
			
 
				-					     BTRFS_INODE_PREALLOC)) {
			
 
				-			ret = check_can_nocow(inode, pos, &write_bytes);
			
 
				-			if (ret < 0)
			
 
				-				break;
			
 
				-			if (ret > 0) {
			
 
				-				/*
			
 
				-				 * For nodata cow case, no need to reserve
			
 
				-				 * data space.
			
 
				-				 */
			
 
				-				only_release_metadata = true;
			
 
				-				/*
			
 
				-				 * our prealloc extent may be smaller than
			
 
				-				 * write_bytes, so scale down.
			
 
				-				 */
			
 
				-				num_pages = DIV_ROUND_UP(write_bytes + offset,
			
 
				-							 PAGE_CACHE_SIZE);
			
 
				-				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
			
 
				-				goto reserve_metadata;
			
 
				-			}
			
 
				+		if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
			
 
				+					      BTRFS_INODE_PREALLOC)) &&
			
 
				+		    check_can_nocow(inode, pos, &write_bytes) > 0) {
			
 
				+			/*
			
 
				+			 * For nodata cow case, no need to reserve
			
 
				+			 * data space.
			
 
				+			 */
			
 
				+			only_release_metadata = true;
			
 
				+			/*
			
 
				+			 * our prealloc extent may be smaller than
			
 
				+			 * write_bytes, so scale down.
			
 
				+			 */
			
 
				+			num_pages = DIV_ROUND_UP(write_bytes + offset,
			
 
				+						 PAGE_CACHE_SIZE);
			
 
				+			reserve_bytes = round_up(write_bytes + sector_offset,
			
 
				+					root->sectorsize);
			
 
				+			goto reserve_metadata;
			
 
				 		}
			
 
				+
			
 
				 		ret = btrfs_check_data_free_space(inode, pos, write_bytes);
			
 
				 		if (ret < 0)
			
 
				 			break;
			
@@ -1576,8 +1583,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
				 			break;
			
 
				 
			
 
				 		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
			
 
				-						      pos, &lockstart, &lockend,
			
 
				-						      &cached_state);
			
 
				+						pos, write_bytes, &lockstart,
			
 
				+						&lockend, &cached_state);
			
 
				 		if (ret < 0) {
			
 
				 			if (ret == -EAGAIN)
			
 
				 				goto again;
			
@@ -1612,9 +1619,16 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
				 		 * we still have an outstanding extent for the chunk we actually
			
 
				 		 * managed to copy.
			
 
				 		 */
			
 
				-		if (num_pages > dirty_pages) {
			
 
				-			release_bytes = (num_pages - dirty_pages) <<
			
 
				-				PAGE_CACHE_SHIFT;
			
 
				+		num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
			
 
				+						reserve_bytes);
			
 
				+		dirty_sectors = round_up(copied + sector_offset,
			
 
				+					root->sectorsize);
			
 
				+		dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
			
 
				+						dirty_sectors);
			
 
				+
			
 
				+		if (num_sectors > dirty_sectors) {
			
 
				+			release_bytes = (write_bytes - copied)
			
 
				+				& ~((u64)root->sectorsize - 1);
			
 
				 			if (copied > 0) {
			
 
				 				spin_lock(&BTRFS_I(inode)->lock);
			
 
				 				BTRFS_I(inode)->outstanding_extents++;
			
@@ -1633,7 +1647,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
				 			}
			
 
				 		}
			
 
				 
			
 
				-		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
			
 
				+		release_bytes = round_up(copied + sector_offset,
			
 
				+					root->sectorsize);
			
 
				 
			
 
				 		if (copied > 0)
			
 
				 			ret = btrfs_dirty_pages(root, inode, pages,
			
@@ -1654,8 +1669,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
				 
			
 
				 		if (only_release_metadata && copied > 0) {
			
 
				 			lockstart = round_down(pos, root->sectorsize);
			
 
				-			lockend = lockstart +
			
 
				-				(dirty_pages << PAGE_CACHE_SHIFT) - 1;
			
 
				+			lockend = round_up(pos + copied, root->sectorsize) - 1;
			
 
				 
			
 
				 			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
			
 
				 				       lockend, EXTENT_NORESERVE, NULL,
			
@@ -1761,6 +1775,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 
				 	ssize_t err;
			
 
				 	loff_t pos;
			
 
				 	size_t count;
			
 
				+	loff_t oldsize;
			
 
				+	int clean_page = 0;
			
 
				 
			
 
				 	inode_lock(inode);
			
 
				 	err = generic_write_checks(iocb, from);
			
@@ -1799,14 +1815,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 
				 	pos = iocb->ki_pos;
			
 
				 	count = iov_iter_count(from);
			
 
				 	start_pos = round_down(pos, root->sectorsize);
			
 
				-	if (start_pos > i_size_read(inode)) {
			
 
				+	oldsize = i_size_read(inode);
			
 
				+	if (start_pos > oldsize) {
			
 
				 		/* Expand hole size to cover write data, preventing empty gap */
			
 
				 		end_pos = round_up(pos + count, root->sectorsize);
			
 
				-		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
			
 
				+		err = btrfs_cont_expand(inode, oldsize, end_pos);
			
 
				 		if (err) {
			
 
				 			inode_unlock(inode);
			
 
				 			goto out;
			
 
				 		}
			
 
				+		if (start_pos > round_up(oldsize, root->sectorsize))
			
 
				+			clean_page = 1;
			
 
				 	}
			
 
				 
			
 
				 	if (sync)
			
@@ -1818,6 +1837,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 
				 		num_written = __btrfs_buffered_write(file, from, pos);
			
 
				 		if (num_written > 0)
			
 
				 			iocb->ki_pos = pos + num_written;
			
 
				+		if (clean_page)
			
 
				+			pagecache_isize_extended(inode, oldsize,
			
 
				+						i_size_read(inode));
			
 
				 	}
			
 
				 
			
 
				 	inode_unlock(inode);
			
@@ -1825,7 +1847,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 
				 	/*
			
 
				 	 * We also have to set last_sub_trans to the current log transid,
			
 
				 	 * otherwise subsequent syncs to a file that's been synced in this
			
 
				-	 * transaction will appear to have already occured.
			
 
				+	 * transaction will appear to have already occurred.
			
 
				 	 */
			
 
				 	spin_lock(&BTRFS_I(inode)->lock);
			
 
				 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
			
@@ -1996,10 +2018,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
				 	 */
			
 
				 	smp_mb();
			
 
				 	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
			
 
				-	    (BTRFS_I(inode)->last_trans <=
			
 
				-	     root->fs_info->last_trans_committed &&
			
 
				-	     (full_sync ||
			
 
				-	      !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
			
 
				+	    (full_sync && BTRFS_I(inode)->last_trans <=
			
 
				+	     root->fs_info->last_trans_committed) ||
			
 
				+	    (!btrfs_have_ordered_extents_in_range(inode, start, len) &&
			
 
				+	     BTRFS_I(inode)->last_trans
			
 
				+	     <= root->fs_info->last_trans_committed)) {
			
 
				 		/*
			
 
				 		 * We'v had everything committed since the last time we were
			
 
				 		 * modified so clear this flag in case it was set for whatever
			
@@ -2293,10 +2316,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
				 	int ret = 0;
			
 
				 	int err = 0;
			
 
				 	unsigned int rsv_count;
			
 
				-	bool same_page;
			
 
				+	bool same_block;
			
 
				 	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
			
 
				 	u64 ino_size;
			
 
				-	bool truncated_page = false;
			
 
				+	bool truncated_block = false;
			
 
				 	bool updated_inode = false;
			
 
				 
			
 
				 	ret = btrfs_wait_ordered_range(inode, offset, len);
			
@@ -2304,7 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
				 		return ret;
			
 
				 
			
 
				 	inode_lock(inode);
			
 
				-	ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
			
 
				+	ino_size = round_up(inode->i_size, root->sectorsize);
			
 
				 	ret = find_first_non_hole(inode, &offset, &len);
			
 
				 	if (ret < 0)
			
 
				 		goto out_only_mutex;
			
@@ -2317,31 +2340,30 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
				 	lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
			
 
				 	lockend = round_down(offset + len,
			
 
				 			     BTRFS_I(inode)->root->sectorsize) - 1;
			
 
				-	same_page = ((offset >> PAGE_CACHE_SHIFT) ==
			
 
				-		    ((offset + len - 1) >> PAGE_CACHE_SHIFT));
			
 
				-
			
 
				+	same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
			
 
				+		== (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
			
 
				 	/*
			
 
				-	 * We needn't truncate any page which is beyond the end of the file
			
 
				+	 * We needn't truncate any block which is beyond the end of the file
			
 
				 	 * because we are sure there is no data there.
			
 
				 	 */
			
 
				 	/*
			
 
				-	 * Only do this if we are in the same page and we aren't doing the
			
 
				-	 * entire page.
			
 
				+	 * Only do this if we are in the same block and we aren't doing the
			
 
				+	 * entire block.
			
 
				 	 */
			
 
				-	if (same_page && len < PAGE_CACHE_SIZE) {
			
 
				+	if (same_block && len < root->sectorsize) {
			
 
				 		if (offset < ino_size) {
			
 
				-			truncated_page = true;
			
 
				-			ret = btrfs_truncate_page(inode, offset, len, 0);
			
 
				+			truncated_block = true;
			
 
				+			ret = btrfs_truncate_block(inode, offset, len, 0);
			
 
				 		} else {
			
 
				 			ret = 0;
			
 
				 		}
			
 
				 		goto out_only_mutex;
			
 
				 	}
			
 
				 
			
 
				-	/* zero back part of the first page */
			
 
				+	/* zero back part of the first block */
			
 
				 	if (offset < ino_size) {
			
 
				-		truncated_page = true;
			
 
				-		ret = btrfs_truncate_page(inode, offset, 0, 0);
			
 
				+		truncated_block = true;
			
 
				+		ret = btrfs_truncate_block(inode, offset, 0, 0);
			
 
				 		if (ret) {
			
 
				 			inode_unlock(inode);
			
 
				 			return ret;
			
@@ -2376,9 +2398,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
				 		if (!ret) {
			
 
				 			/* zero the front end of the last page */
			
 
				 			if (tail_start + tail_len < ino_size) {
			
 
				-				truncated_page = true;
			
 
				-				ret = btrfs_truncate_page(inode,
			
 
				-						tail_start + tail_len, 0, 1);
			
 
				+				truncated_block = true;
			
 
				+				ret = btrfs_truncate_block(inode,
			
 
				+							tail_start + tail_len,
			
 
				+							0, 1);
			
 
				 				if (ret)
			
 
				 					goto out_only_mutex;
			
 
				 			}
			
@@ -2544,7 +2567,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
				 		goto out_free;
			
 
				 
			
 
				 	inode_inc_iversion(inode);
			
 
				-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
			
 
				+	inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
			
 
				 
			
 
				 	trans->block_rsv = &root->fs_info->trans_block_rsv;
			
 
				 	ret = btrfs_update_inode(trans, root, inode);
			
@@ -2558,7 +2581,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
				 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			
 
				 			     &cached_state, GFP_NOFS);
			
 
				 out_only_mutex:
			
 
				-	if (!updated_inode && truncated_page && !ret && !err) {
			
 
				+	if (!updated_inode && truncated_block && !ret && !err) {
			
 
				 		/*
			
 
				 		 * If we only end up zeroing part of a page, we still need to
			
 
				 		 * update the inode item, so that all the time fields are
			
@@ -2611,7 +2634,7 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
 
				 		return 0;
			
 
				 	}
			
 
				 insert:
			
 
				-	range = kmalloc(sizeof(*range), GFP_NOFS);
			
 
				+	range = kmalloc(sizeof(*range), GFP_KERNEL);
			
 
				 	if (!range)
			
 
				 		return -ENOMEM;
			
 
				 	range->start = start;
			
@@ -2678,10 +2701,10 @@ static long btrfs_fallocate(struct file *file, int mode,
 
				 	} else if (offset + len > inode->i_size) {
			
 
				 		/*
			
 
				 		 * If we are fallocating from the end of the file onward we
			
 
				-		 * need to zero out the end of the page if i_size lands in the
			
 
				-		 * middle of a page.
			
 
				+		 * need to zero out the end of the block if i_size lands in the
			
 
				+		 * middle of a block.
			
 
				 		 */
			
 
				-		ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
			
 
				+		ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
			
 
				 		if (ret)
			
 
				 			goto out;
			
 
				 	}
			
@@ -2712,7 +2735,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 
				 			btrfs_put_ordered_extent(ordered);
			
 
				 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
			
 
				 					     alloc_start, locked_end,
			
 
				-					     &cached_state, GFP_NOFS);
			
 
				+					     &cached_state, GFP_KERNEL);
			
 
				 			/*
			
 
				 			 * we can't wait on the range with the transaction
			
 
				 			 * running or with the extent lock held
			
@@ -2794,7 +2817,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 
				 		if (IS_ERR(trans)) {
			
 
				 			ret = PTR_ERR(trans);
			
 
				 		} else {
			
 
				-			inode->i_ctime = CURRENT_TIME;
			
 
				+			inode->i_ctime = current_fs_time(inode->i_sb);
			
 
				 			i_size_write(inode, actual_end);
			
 
				 			btrfs_ordered_update_i_size(inode, actual_end, NULL);
			
 
				 			ret = btrfs_update_inode(trans, root, inode);
			
@@ -2806,7 +2829,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 
				 	}
			
 
				 out_unlock:
			
 
				 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			
 
				-			     &cached_state, GFP_NOFS);
			
 
				+			     &cached_state, GFP_KERNEL);
			
 
				 out:
			
 
				 	/*
			
 
				 	 * As we waited the extent range, the data_rsv_map must be empty
			
@@ -2939,8 +2962,7 @@ const struct file_operations btrfs_file_operations = {
 
				 
			
 
				 void btrfs_auto_defrag_exit(void)
			
 
				 {
			
 
				-	if (btrfs_inode_defrag_cachep)
			
 
				-		kmem_cache_destroy(btrfs_inode_defrag_cachep);
			
 
				+	kmem_cache_destroy(btrfs_inode_defrag_cachep);
			
 
				 }
			
 
				 
			
 
				 int btrfs_auto_defrag_init(void)
			
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -556,6 +556,9 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
 
				 	mutex_lock(&root->objectid_mutex);
			
 
				 
			
 
				 	if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
			
 
				+		btrfs_warn(root->fs_info,
			
 
				+			   "the objectid of root %llu reaches its highest value",
			
 
				+			   root->root_key.objectid);
			
 
				 		ret = -ENOSPC;
			
 
				 		goto out;
			
 
				 	}
			
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -263,7 +263,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 
				 		data_len = compressed_size;
			
 
				 
			
 
				 	if (start > 0 ||
			
 
				-	    actual_end > PAGE_CACHE_SIZE ||
			
 
				+	    actual_end > root->sectorsize ||
			
 
				 	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
			
 
				 	    (!compressed_size &&
			
 
				 	    (actual_end & (root->sectorsize - 1)) == 0) ||
			
@@ -2002,7 +2002,8 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 
				 	if (PagePrivate2(page))
			
 
				 		goto out;
			
 
				 
			
 
				-	ordered = btrfs_lookup_ordered_extent(inode, page_start);
			
 
				+	ordered = btrfs_lookup_ordered_range(inode, page_start,
			
 
				+					PAGE_CACHE_SIZE);
			
 
				 	if (ordered) {
			
 
				 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
			
 
				 				     page_end, &cached_state, GFP_NOFS);
			
@@ -4013,7 +4014,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 
				 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
			
 
				 	inode_inc_iversion(inode);
			
 
				 	inode_inc_iversion(dir);
			
 
				-	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
			
 
				+	inode->i_ctime = dir->i_mtime =
			
 
				+		dir->i_ctime = current_fs_time(inode->i_sb);
			
 
				 	ret = btrfs_update_inode(trans, root, dir);
			
 
				 out:
			
 
				 	return ret;
			
@@ -4156,7 +4158,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 
				 
			
 
				 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
			
 
				 	inode_inc_iversion(dir);
			
 
				-	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
			
 
				+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
			
 
				 	ret = btrfs_update_inode_fallback(trans, root, dir);
			
 
				 	if (ret)
			
 
				 		btrfs_abort_transaction(trans, root, ret);
			
@@ -4211,11 +4213,20 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
 
				 {
			
 
				 	int ret;
			
 
				 
			
 
				+	/*
			
 
				+	 * This is only used to apply pressure to the enospc system, we don't
			
 
				+	 * intend to use this reservation at all.
			
 
				+	 */
			
 
				 	bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
			
 
				+	bytes_deleted *= root->nodesize;
			
 
				 	ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
			
 
				 				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
			
 
				-	if (!ret)
			
 
				+	if (!ret) {
			
 
				+		trace_btrfs_space_reservation(root->fs_info, "transaction",
			
 
				+					      trans->transid,
			
 
				+					      bytes_deleted, 1);
			
 
				 		trans->bytes_reserved += bytes_deleted;
			
 
				+	}
			
 
				 	return ret;
			
 
				 
			
 
				 }
			
@@ -4248,7 +4259,8 @@ static int truncate_inline_extent(struct inode *inode,
 
				 		 * read the extent item from disk (data not in the page cache).
			
 
				 		 */
			
 
				 		btrfs_release_path(path);
			
 
				-		return btrfs_truncate_page(inode, offset, page_end - offset, 0);
			
 
				+		return btrfs_truncate_block(inode, offset, page_end - offset,
			
 
				+					0);
			
 
				 	}
			
 
				 
			
 
				 	btrfs_set_file_extent_ram_bytes(leaf, fi, size);
			
@@ -4601,17 +4613,17 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * btrfs_truncate_page - read, zero a chunk and write a page
			
 
				+ * btrfs_truncate_block - read, zero a chunk and write a block
			
 
				  * @inode - inode that we're zeroing
			
 
				  * @from - the offset to start zeroing
			
 
				  * @len - the length to zero, 0 to zero the entire range respective to the
			
 
				  *	offset
			
 
				  * @front - zero up to the offset instead of from the offset on
			
 
				  *
			
 
				- * This will find the page for the "from" offset and cow the page and zero the
			
 
				+ * This will find the block for the "from" offset and cow the block and zero the
			
 
				  * part we want to zero.  This is used with truncate and hole punching.
			
 
				  */
			
 
				-int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
			
 
				+int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
			
 
				 			int front)
			
 
				 {
			
 
				 	struct address_space *mapping = inode->i_mapping;
			
@@ -4622,18 +4634,19 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 
				 	char *kaddr;
			
 
				 	u32 blocksize = root->sectorsize;
			
 
				 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
			
 
				-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
			
 
				+	unsigned offset = from & (blocksize - 1);
			
 
				 	struct page *page;
			
 
				 	gfp_t mask = btrfs_alloc_write_mask(mapping);
			
 
				 	int ret = 0;
			
 
				-	u64 page_start;
			
 
				-	u64 page_end;
			
 
				+	u64 block_start;
			
 
				+	u64 block_end;
			
 
				 
			
 
				 	if ((offset & (blocksize - 1)) == 0 &&
			
 
				 	    (!len || ((len & (blocksize - 1)) == 0)))
			
 
				 		goto out;
			
 
				+
			
 
				 	ret = btrfs_delalloc_reserve_space(inode,
			
 
				-			round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
			
 
				+			round_down(from, blocksize), blocksize);
			
 
				 	if (ret)
			
 
				 		goto out;
			
 
				 
			
@@ -4641,14 +4654,14 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 
				 	page = find_or_create_page(mapping, index, mask);
			
 
				 	if (!page) {
			
 
				 		btrfs_delalloc_release_space(inode,
			
 
				-				round_down(from, PAGE_CACHE_SIZE),
			
 
				-				PAGE_CACHE_SIZE);
			
 
				+				round_down(from, blocksize),
			
 
				+				blocksize);
			
 
				 		ret = -ENOMEM;
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	page_start = page_offset(page);
			
 
				-	page_end = page_start + PAGE_CACHE_SIZE - 1;
			
 
				+	block_start = round_down(from, blocksize);
			
 
				+	block_end = block_start + blocksize - 1;
			
 
				 
			
 
				 	if (!PageUptodate(page)) {
			
 
				 		ret = btrfs_readpage(NULL, page);
			
@@ -4665,12 +4678,12 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 
				 	}
			
 
				 	wait_on_page_writeback(page);
			
 
				 
			
 
				-	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
			
 
				+	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
			
 
				 	set_page_extent_mapped(page);
			
 
				 
			
 
				-	ordered = btrfs_lookup_ordered_extent(inode, page_start);
			
 
				+	ordered = btrfs_lookup_ordered_extent(inode, block_start);
			
 
				 	if (ordered) {
			
 
				-		unlock_extent_cached(io_tree, page_start, page_end,
			
 
				+		unlock_extent_cached(io_tree, block_start, block_end,
			
 
				 				     &cached_state, GFP_NOFS);
			
 
				 		unlock_page(page);
			
 
				 		page_cache_release(page);
			
@@ -4679,39 +4692,41 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 
				 		goto again;
			
 
				 	}
			
 
				 
			
 
				-	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
			
 
				+	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
			
 
				 			  EXTENT_DIRTY | EXTENT_DELALLOC |
			
 
				 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			
 
				 			  0, 0, &cached_state, GFP_NOFS);
			
 
				 
			
 
				-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
			
 
				+	ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
			
 
				 					&cached_state);
			
 
				 	if (ret) {
			
 
				-		unlock_extent_cached(io_tree, page_start, page_end,
			
 
				+		unlock_extent_cached(io_tree, block_start, block_end,
			
 
				 				     &cached_state, GFP_NOFS);
			
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				-	if (offset != PAGE_CACHE_SIZE) {
			
 
				+	if (offset != blocksize) {
			
 
				 		if (!len)
			
 
				-			len = PAGE_CACHE_SIZE - offset;
			
 
				+			len = blocksize - offset;
			
 
				 		kaddr = kmap(page);
			
 
				 		if (front)
			
 
				-			memset(kaddr, 0, offset);
			
 
				+			memset(kaddr + (block_start - page_offset(page)),
			
 
				+				0, offset);
			
 
				 		else
			
 
				-			memset(kaddr + offset, 0, len);
			
 
				+			memset(kaddr + (block_start - page_offset(page)) +  offset,
			
 
				+				0, len);
			
 
				 		flush_dcache_page(page);
			
 
				 		kunmap(page);
			
 
				 	}
			
 
				 	ClearPageChecked(page);
			
 
				 	set_page_dirty(page);
			
 
				-	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
			
 
				+	unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
			
 
				 			     GFP_NOFS);
			
 
				 
			
 
				 out_unlock:
			
 
				 	if (ret)
			
 
				-		btrfs_delalloc_release_space(inode, page_start,
			
 
				-					     PAGE_CACHE_SIZE);
			
 
				+		btrfs_delalloc_release_space(inode, block_start,
			
 
				+					     blocksize);
			
 
				 	unlock_page(page);
			
 
				 	page_cache_release(page);
			
 
				 out:
			
@@ -4782,11 +4797,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
				 	int err = 0;
			
 
				 
			
 
				 	/*
			
 
				-	 * If our size started in the middle of a page we need to zero out the
			
 
				-	 * rest of the page before we expand the i_size, otherwise we could
			
 
				+	 * If our size started in the middle of a block we need to zero out the
			
 
				+	 * rest of the block before we expand the i_size, otherwise we could
			
 
				 	 * expose stale data.
			
 
				 	 */
			
 
				-	err = btrfs_truncate_page(inode, oldsize, 0, 0);
			
 
				+	err = btrfs_truncate_block(inode, oldsize, 0, 0);
			
 
				 	if (err)
			
 
				 		return err;
			
 
				 
			
@@ -4895,7 +4910,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 
				 	}
			
 
				 
			
 
				 	if (newsize > oldsize) {
			
 
				-		truncate_pagecache(inode, newsize);
			
 
				 		/*
			
 
				 		 * Don't do an expanding truncate while snapshoting is ongoing.
			
 
				 		 * This is to ensure the snapshot captures a fully consistent
			
@@ -4918,6 +4932,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 
				 
			
 
				 		i_size_write(inode, newsize);
			
 
				 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
			
 
				+		pagecache_isize_extended(inode, oldsize, newsize);
			
 
				 		ret = btrfs_update_inode(trans, root, inode);
			
 
				 		btrfs_end_write_no_snapshoting(root);
			
 
				 		btrfs_end_transaction(trans, root);
			
@@ -5588,7 +5603,7 @@ static struct inode *new_simple_dir(struct super_block *s,
 
				 	inode->i_op = &btrfs_dir_ro_inode_operations;
			
 
				 	inode->i_fop = &simple_dir_operations;
			
 
				 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
			
 
				-	inode->i_mtime = CURRENT_TIME;
			
 
				+	inode->i_mtime = current_fs_time(inode->i_sb);
			
 
				 	inode->i_atime = inode->i_mtime;
			
 
				 	inode->i_ctime = inode->i_mtime;
			
 
				 	BTRFS_I(inode)->i_otime = inode->i_mtime;
			
@@ -5790,7 +5805,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 
				 			if (name_len <= sizeof(tmp_name)) {
			
 
				 				name_ptr = tmp_name;
			
 
				 			} else {
			
 
				-				name_ptr = kmalloc(name_len, GFP_NOFS);
			
 
				+				name_ptr = kmalloc(name_len, GFP_KERNEL);
			
 
				 				if (!name_ptr) {
			
 
				 					ret = -ENOMEM;
			
 
				 					goto err;
			
@@ -6172,7 +6187,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
				 	inode_init_owner(inode, dir, mode);
			
 
				 	inode_set_bytes(inode, 0);
			
 
				 
			
 
				-	inode->i_mtime = CURRENT_TIME;
			
 
				+	inode->i_mtime = current_fs_time(inode->i_sb);
			
 
				 	inode->i_atime = inode->i_mtime;
			
 
				 	inode->i_ctime = inode->i_mtime;
			
 
				 	BTRFS_I(inode)->i_otime = inode->i_mtime;
			
@@ -6285,7 +6300,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 
				 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
			
 
				 			   name_len * 2);
			
 
				 	inode_inc_iversion(parent_inode);
			
 
				-	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
			
 
				+	parent_inode->i_mtime = parent_inode->i_ctime =
			
 
				+		current_fs_time(parent_inode->i_sb);
			
 
				 	ret = btrfs_update_inode(trans, root, parent_inode);
			
 
				 	if (ret)
			
 
				 		btrfs_abort_transaction(trans, root, ret);
			
@@ -6503,7 +6519,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
				 	BTRFS_I(inode)->dir_index = 0ULL;
			
 
				 	inc_nlink(inode);
			
 
				 	inode_inc_iversion(inode);
			
 
				-	inode->i_ctime = CURRENT_TIME;
			
 
				+	inode->i_ctime = current_fs_time(inode->i_sb);
			
 
				 	ihold(inode);
			
 
				 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
			
 
				 
			
@@ -7414,7 +7430,26 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 
				 				     cached_state, GFP_NOFS);
			
 
				 
			
 
				 		if (ordered) {
			
 
				-			btrfs_start_ordered_extent(inode, ordered, 1);
			
 
				+			/*
			
 
				+			 * If we are doing a DIO read and the ordered extent we
			
 
				+			 * found is for a buffered write, we can not wait for it
			
 
				+			 * to complete and retry, because if we do so we can
			
 
				+			 * deadlock with concurrent buffered writes on page
			
 
				+			 * locks. This happens only if our DIO read covers more
			
 
				+			 * than one extent map, if at this point has already
			
 
				+			 * created an ordered extent for a previous extent map
			
 
				+			 * and locked its range in the inode's io tree, and a
			
 
				+			 * concurrent write against that previous extent map's
			
 
				+			 * range and this range started (we unlock the ranges
			
 
				+			 * in the io tree only when the bios complete and
			
 
				+			 * buffered writes always lock pages before attempting
			
 
				+			 * to lock range in the io tree).
			
 
				+			 */
			
 
				+			if (writing ||
			
 
				+			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
			
 
				+				btrfs_start_ordered_extent(inode, ordered, 1);
			
 
				+			else
			
 
				+				ret = -ENOTBLK;
			
 
				 			btrfs_put_ordered_extent(ordered);
			
 
				 		} else {
			
 
				 			/*
			
@@ -7431,9 +7466,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 
				 			 * that page.
			
 
				 			 */
			
 
				 			ret = -ENOTBLK;
			
 
				-			break;
			
 
				 		}
			
 
				 
			
 
				+		if (ret)
			
 
				+			break;
			
 
				+
			
 
				 		cond_resched();
			
 
				 	}
			
 
				 
			
@@ -7764,9 +7801,9 @@ static int btrfs_check_dio_repairable(struct inode *inode,
 
				 }
			
 
				 
			
 
				 static int dio_read_error(struct inode *inode, struct bio *failed_bio,
			
 
				-			  struct page *page, u64 start, u64 end,
			
 
				-			  int failed_mirror, bio_end_io_t *repair_endio,
			
 
				-			  void *repair_arg)
			
 
				+			struct page *page, unsigned int pgoff,
			
 
				+			u64 start, u64 end, int failed_mirror,
			
 
				+			bio_end_io_t *repair_endio, void *repair_arg)
			
 
				 {
			
 
				 	struct io_failure_record *failrec;
			
 
				 	struct bio *bio;
			
@@ -7787,7 +7824,9 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 
				 		return -EIO;
			
 
				 	}
			
 
				 
			
 
				-	if (failed_bio->bi_vcnt > 1)
			
 
				+	if ((failed_bio->bi_vcnt > 1)
			
 
				+		|| (failed_bio->bi_io_vec->bv_len
			
 
				+			> BTRFS_I(inode)->root->sectorsize))
			
 
				 		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
			
 
				 	else
			
 
				 		read_mode = READ_SYNC;
			
@@ -7795,7 +7834,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 
				 	isector = start - btrfs_io_bio(failed_bio)->logical;
			
 
				 	isector >>= inode->i_sb->s_blocksize_bits;
			
 
				 	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
			
 
				-				      0, isector, repair_endio, repair_arg);
			
 
				+				pgoff, isector, repair_endio, repair_arg);
			
 
				 	if (!bio) {
			
 
				 		free_io_failure(inode, failrec);
			
 
				 		return -EIO;
			
@@ -7825,12 +7864,17 @@ struct btrfs_retry_complete {
 
				 static void btrfs_retry_endio_nocsum(struct bio *bio)
			
 
				 {
			
 
				 	struct btrfs_retry_complete *done = bio->bi_private;
			
 
				+	struct inode *inode;
			
 
				 	struct bio_vec *bvec;
			
 
				 	int i;
			
 
				 
			
 
				 	if (bio->bi_error)
			
 
				 		goto end;
			
 
				 
			
 
				+	ASSERT(bio->bi_vcnt == 1);
			
 
				+	inode = bio->bi_io_vec->bv_page->mapping->host;
			
 
				+	ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
			
 
				+
			
 
				 	done->uptodate = 1;
			
 
				 	bio_for_each_segment_all(bvec, bio, i)
			
 
				 		clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
			
@@ -7842,25 +7886,35 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
 
				 static int __btrfs_correct_data_nocsum(struct inode *inode,
			
 
				 				       struct btrfs_io_bio *io_bio)
			
 
				 {
			
 
				+	struct btrfs_fs_info *fs_info;
			
 
				 	struct bio_vec *bvec;
			
 
				 	struct btrfs_retry_complete done;
			
 
				 	u64 start;
			
 
				+	unsigned int pgoff;
			
 
				+	u32 sectorsize;
			
 
				+	int nr_sectors;
			
 
				 	int i;
			
 
				 	int ret;
			
 
				 
			
 
				+	fs_info = BTRFS_I(inode)->root->fs_info;
			
 
				+	sectorsize = BTRFS_I(inode)->root->sectorsize;
			
 
				+
			
 
				 	start = io_bio->logical;
			
 
				 	done.inode = inode;
			
 
				 
			
 
				 	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
			
 
				-try_again:
			
 
				+		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
			
 
				+		pgoff = bvec->bv_offset;
			
 
				+
			
 
				+next_block_or_try_again:
			
 
				 		done.uptodate = 0;
			
 
				 		done.start = start;
			
 
				 		init_completion(&done.done);
			
 
				 
			
 
				-		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
			
 
				-				     start + bvec->bv_len - 1,
			
 
				-				     io_bio->mirror_num,
			
 
				-				     btrfs_retry_endio_nocsum, &done);
			
 
				+		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
			
 
				+				pgoff, start, start + sectorsize - 1,
			
 
				+				io_bio->mirror_num,
			
 
				+				btrfs_retry_endio_nocsum, &done);
			
 
				 		if (ret)
			
 
				 			return ret;
			
 
				 
			
@@ -7868,10 +7922,15 @@ static int __btrfs_correct_data_nocsum(struct inode *inode,
 
				 
			
 
				 		if (!done.uptodate) {
			
 
				 			/* We might have another mirror, so try again */
			
 
				-			goto try_again;
			
 
				+			goto next_block_or_try_again;
			
 
				 		}
			
 
				 
			
 
				-		start += bvec->bv_len;
			
 
				+		start += sectorsize;
			
 
				+
			
 
				+		if (nr_sectors--) {
			
 
				+			pgoff += sectorsize;
			
 
				+			goto next_block_or_try_again;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	return 0;
			
@@ -7881,7 +7940,9 @@ static void btrfs_retry_endio(struct bio *bio)
 
				 {
			
 
				 	struct btrfs_retry_complete *done = bio->bi_private;
			
 
				 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
			
 
				+	struct inode *inode;
			
 
				 	struct bio_vec *bvec;
			
 
				+	u64 start;
			
 
				 	int uptodate;
			
 
				 	int ret;
			
 
				 	int i;
			
@@ -7890,13 +7951,20 @@ static void btrfs_retry_endio(struct bio *bio)
 
				 		goto end;
			
 
				 
			
 
				 	uptodate = 1;
			
 
				+
			
 
				+	start = done->start;
			
 
				+
			
 
				+	ASSERT(bio->bi_vcnt == 1);
			
 
				+	inode = bio->bi_io_vec->bv_page->mapping->host;
			
 
				+	ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
			
 
				+
			
 
				 	bio_for_each_segment_all(bvec, bio, i) {
			
 
				 		ret = __readpage_endio_check(done->inode, io_bio, i,
			
 
				-					     bvec->bv_page, 0,
			
 
				-					     done->start, bvec->bv_len);
			
 
				+					bvec->bv_page, bvec->bv_offset,
			
 
				+					done->start, bvec->bv_len);
			
 
				 		if (!ret)
			
 
				 			clean_io_failure(done->inode, done->start,
			
 
				-					 bvec->bv_page, 0);
			
 
				+					bvec->bv_page, bvec->bv_offset);
			
 
				 		else
			
 
				 			uptodate = 0;
			
 
				 	}
			
@@ -7910,20 +7978,34 @@ static void btrfs_retry_endio(struct bio *bio)
 
				 static int __btrfs_subio_endio_read(struct inode *inode,
			
 
				 				    struct btrfs_io_bio *io_bio, int err)
			
 
				 {
			
 
				+	struct btrfs_fs_info *fs_info;
			
 
				 	struct bio_vec *bvec;
			
 
				 	struct btrfs_retry_complete done;
			
 
				 	u64 start;
			
 
				 	u64 offset = 0;
			
 
				+	u32 sectorsize;
			
 
				+	int nr_sectors;
			
 
				+	unsigned int pgoff;
			
 
				+	int csum_pos;
			
 
				 	int i;
			
 
				 	int ret;
			
 
				 
			
 
				+	fs_info = BTRFS_I(inode)->root->fs_info;
			
 
				+	sectorsize = BTRFS_I(inode)->root->sectorsize;
			
 
				+
			
 
				 	err = 0;
			
 
				 	start = io_bio->logical;
			
 
				 	done.inode = inode;
			
 
				 
			
 
				 	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
			
 
				-		ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
			
 
				-					     0, start, bvec->bv_len);
			
 
				+		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
			
 
				+
			
 
				+		pgoff = bvec->bv_offset;
			
 
				+next_block:
			
 
				+		csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
			
 
				+		ret = __readpage_endio_check(inode, io_bio, csum_pos,
			
 
				+					bvec->bv_page, pgoff, start,
			
 
				+					sectorsize);
			
 
				 		if (likely(!ret))
			
 
				 			goto next;
			
 
				 try_again:
			
@@ -7931,10 +8013,10 @@ static int __btrfs_subio_endio_read(struct inode *inode,
 
				 		done.start = start;
			
 
				 		init_completion(&done.done);
			
 
				 
			
 
				-		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
			
 
				-				     start + bvec->bv_len - 1,
			
 
				-				     io_bio->mirror_num,
			
 
				-				     btrfs_retry_endio, &done);
			
 
				+		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
			
 
				+				pgoff, start, start + sectorsize - 1,
			
 
				+				io_bio->mirror_num,
			
 
				+				btrfs_retry_endio, &done);
			
 
				 		if (ret) {
			
 
				 			err = ret;
			
 
				 			goto next;
			
@@ -7947,8 +8029,15 @@ static int __btrfs_subio_endio_read(struct inode *inode,
 
				 			goto try_again;
			
 
				 		}
			
 
				 next:
			
 
				-		offset += bvec->bv_len;
			
 
				-		start += bvec->bv_len;
			
 
				+		offset += sectorsize;
			
 
				+		start += sectorsize;
			
 
				+
			
 
				+		ASSERT(nr_sectors);
			
 
				+
			
 
				+		if (--nr_sectors) {
			
 
				+			pgoff += sectorsize;
			
 
				+			goto next_block;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	return err;
			
@@ -8202,9 +8291,11 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
				 	u64 file_offset = dip->logical_offset;
			
 
				 	u64 submit_len = 0;
			
 
				 	u64 map_length;
			
 
				-	int nr_pages = 0;
			
 
				-	int ret;
			
 
				+	u32 blocksize = root->sectorsize;
			
 
				 	int async_submit = 0;
			
 
				+	int nr_sectors;
			
 
				+	int ret;
			
 
				+	int i;
			
 
				 
			
 
				 	map_length = orig_bio->bi_iter.bi_size;
			
 
				 	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
			
@@ -8234,9 +8325,12 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
				 	atomic_inc(&dip->pending_bios);
			
 
				 
			
 
				 	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
			
 
				-		if (map_length < submit_len + bvec->bv_len ||
			
 
				-		    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
			
 
				-				 bvec->bv_offset) < bvec->bv_len) {
			
 
				+		nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
			
 
				+		i = 0;
			
 
				+next_block:
			
 
				+		if (unlikely(map_length < submit_len + blocksize ||
			
 
				+		    bio_add_page(bio, bvec->bv_page, blocksize,
			
 
				+			    bvec->bv_offset + (i * blocksize)) < blocksize)) {
			
 
				 			/*
			
 
				 			 * inc the count before we submit the bio so
			
 
				 			 * we know the end IO handler won't happen before
			
@@ -8257,7 +8351,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
				 			file_offset += submit_len;
			
 
				 
			
 
				 			submit_len = 0;
			
 
				-			nr_pages = 0;
			
 
				 
			
 
				 			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
			
 
				 						  start_sector, GFP_NOFS);
			
@@ -8275,9 +8368,14 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
				 				bio_put(bio);
			
 
				 				goto out_err;
			
 
				 			}
			
 
				+
			
 
				+			goto next_block;
			
 
				 		} else {
			
 
				-			submit_len += bvec->bv_len;
			
 
				-			nr_pages++;
			
 
				+			submit_len += blocksize;
			
 
				+			if (--nr_sectors) {
			
 
				+				i++;
			
 
				+				goto next_block;
			
 
				+			}
			
 
				 			bvec++;
			
 
				 		}
			
 
				 	}
			
@@ -8642,6 +8740,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 
				 	struct extent_state *cached_state = NULL;
			
 
				 	u64 page_start = page_offset(page);
			
 
				 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
			
 
				+	u64 start;
			
 
				+	u64 end;
			
 
				 	int inode_evicting = inode->i_state & I_FREEING;
			
 
				 
			
 
				 	/*
			
@@ -8661,14 +8761,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 
				 
			
 
				 	if (!inode_evicting)
			
 
				 		lock_extent_bits(tree, page_start, page_end, &cached_state);
			
 
				-	ordered = btrfs_lookup_ordered_extent(inode, page_start);
			
 
				+again:
			
 
				+	start = page_start;
			
 
				+	ordered = btrfs_lookup_ordered_range(inode, start,
			
 
				+					page_end - start + 1);
			
 
				 	if (ordered) {
			
 
				+		end = min(page_end, ordered->file_offset + ordered->len - 1);
			
 
				 		/*
			
 
				 		 * IO on this page will never be started, so we need
			
 
				 		 * to account for any ordered extents now
			
 
				 		 */
			
 
				 		if (!inode_evicting)
			
 
				-			clear_extent_bit(tree, page_start, page_end,
			
 
				+			clear_extent_bit(tree, start, end,
			
 
				 					 EXTENT_DIRTY | EXTENT_DELALLOC |
			
 
				 					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
			
 
				 					 EXTENT_DEFRAG, 1, 0, &cached_state,
			
@@ -8685,22 +8789,26 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 
				 
			
 
				 			spin_lock_irq(&tree->lock);
			
 
				 			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
			
 
				-			new_len = page_start - ordered->file_offset;
			
 
				+			new_len = start - ordered->file_offset;
			
 
				 			if (new_len < ordered->truncated_len)
			
 
				 				ordered->truncated_len = new_len;
			
 
				 			spin_unlock_irq(&tree->lock);
			
 
				 
			
 
				 			if (btrfs_dec_test_ordered_pending(inode, &ordered,
			
 
				-							   page_start,
			
 
				-							   PAGE_CACHE_SIZE, 1))
			
 
				+							   start,
			
 
				+							   end - start + 1, 1))
			
 
				 				btrfs_finish_ordered_io(ordered);
			
 
				 		}
			
 
				 		btrfs_put_ordered_extent(ordered);
			
 
				 		if (!inode_evicting) {
			
 
				 			cached_state = NULL;
			
 
				-			lock_extent_bits(tree, page_start, page_end,
			
 
				+			lock_extent_bits(tree, start, end,
			
 
				 					 &cached_state);
			
 
				 		}
			
 
				+
			
 
				+		start = end + 1;
			
 
				+		if (start < page_end)
			
 
				+			goto again;
			
 
				 	}
			
 
				 
			
 
				 	/*
			
@@ -8761,15 +8869,28 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	loff_t size;
			
 
				 	int ret;
			
 
				 	int reserved = 0;
			
 
				+	u64 reserved_space;
			
 
				 	u64 page_start;
			
 
				 	u64 page_end;
			
 
				+	u64 end;
			
 
				+
			
 
				+	reserved_space = PAGE_CACHE_SIZE;
			
 
				 
			
 
				 	sb_start_pagefault(inode->i_sb);
			
 
				 	page_start = page_offset(page);
			
 
				 	page_end = page_start + PAGE_CACHE_SIZE - 1;
			
 
				+	end = page_end;
			
 
				 
			
 
				+	/*
			
 
				+	 * Reserving delalloc space after obtaining the page lock can lead to
			
 
				+	 * deadlock. For example, if a dirty page is locked by this function
			
 
				+	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
			
 
				+	 * dirty page write out, then the btrfs_writepage() function could
			
 
				+	 * end up waiting indefinitely to get a lock on the page currently
			
 
				+	 * being processed by btrfs_page_mkwrite() function.
			
 
				+	 */
			
 
				 	ret = btrfs_delalloc_reserve_space(inode, page_start,
			
 
				-					   PAGE_CACHE_SIZE);
			
 
				+					   reserved_space);
			
 
				 	if (!ret) {
			
 
				 		ret = file_update_time(vma->vm_file);
			
 
				 		reserved = 1;
			
@@ -8803,7 +8924,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	 * we can't set the delalloc bits if there are pending ordered
			
 
				 	 * extents.  Drop our locks and wait for them to finish
			
 
				 	 */
			
 
				-	ordered = btrfs_lookup_ordered_extent(inode, page_start);
			
 
				+	ordered = btrfs_lookup_ordered_range(inode, page_start, page_end);
			
 
				 	if (ordered) {
			
 
				 		unlock_extent_cached(io_tree, page_start, page_end,
			
 
				 				     &cached_state, GFP_NOFS);
			
@@ -8813,6 +8934,18 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 		goto again;
			
 
				 	}
			
 
				 
			
 
				+	if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
			
 
				+		reserved_space = round_up(size - page_start, root->sectorsize);
			
 
				+		if (reserved_space < PAGE_CACHE_SIZE) {
			
 
				+			end = page_start + reserved_space - 1;
			
 
				+			spin_lock(&BTRFS_I(inode)->lock);
			
 
				+			BTRFS_I(inode)->outstanding_extents++;
			
 
				+			spin_unlock(&BTRFS_I(inode)->lock);
			
 
				+			btrfs_delalloc_release_space(inode, page_start,
			
 
				+						PAGE_CACHE_SIZE - reserved_space);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	/*
			
 
				 	 * XXX - page_mkwrite gets called every time the page is dirtied, even
			
 
				 	 * if it was already dirty, so for space accounting reasons we need to
			
@@ -8820,12 +8953,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	 * is probably a better way to do this, but for now keep consistent with
			
 
				 	 * prepare_pages in the normal write path.
			
 
				 	 */
			
 
				-	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
			
 
				+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
			
 
				 			  EXTENT_DIRTY | EXTENT_DELALLOC |
			
 
				 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			
 
				 			  0, 0, &cached_state, GFP_NOFS);
			
 
				 
			
 
				-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
			
 
				+	ret = btrfs_set_extent_delalloc(inode, page_start, end,
			
 
				 					&cached_state);
			
 
				 	if (ret) {
			
 
				 		unlock_extent_cached(io_tree, page_start, page_end,
			
@@ -8864,7 +8997,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	}
			
 
				 	unlock_page(page);
			
 
				 out:
			
 
				-	btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
			
 
				+	btrfs_delalloc_release_space(inode, page_start, reserved_space);
			
 
				 out_noreserve:
			
 
				 	sb_end_pagefault(inode->i_sb);
			
 
				 	return ret;
			
@@ -9190,16 +9323,11 @@ void btrfs_destroy_cachep(void)
 
				 	 * destroy cache.
			
 
				 	 */
			
 
				 	rcu_barrier();
			
 
				-	if (btrfs_inode_cachep)
			
 
				-		kmem_cache_destroy(btrfs_inode_cachep);
			
 
				-	if (btrfs_trans_handle_cachep)
			
 
				-		kmem_cache_destroy(btrfs_trans_handle_cachep);
			
 
				-	if (btrfs_transaction_cachep)
			
 
				-		kmem_cache_destroy(btrfs_transaction_cachep);
			
 
				-	if (btrfs_path_cachep)
			
 
				-		kmem_cache_destroy(btrfs_path_cachep);
			
 
				-	if (btrfs_free_space_cachep)
			
 
				-		kmem_cache_destroy(btrfs_free_space_cachep);
			
 
				+	kmem_cache_destroy(btrfs_inode_cachep);
			
 
				+	kmem_cache_destroy(btrfs_trans_handle_cachep);
			
 
				+	kmem_cache_destroy(btrfs_transaction_cachep);
			
 
				+	kmem_cache_destroy(btrfs_path_cachep);
			
 
				+	kmem_cache_destroy(btrfs_free_space_cachep);
			
 
				 }
			
 
				 
			
 
				 int btrfs_init_cachep(void)
			
@@ -9250,7 +9378,6 @@ static int btrfs_getattr(struct vfsmount *mnt,
 
				 
			
 
				 	generic_fillattr(inode, stat);
			
 
				 	stat->dev = BTRFS_I(inode)->root->anon_dev;
			
 
				-	stat->blksize = PAGE_CACHE_SIZE;
			
 
				 
			
 
				 	spin_lock(&BTRFS_I(inode)->lock);
			
 
				 	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
			
@@ -9268,7 +9395,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
				 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
			
 
				 	struct inode *new_inode = d_inode(new_dentry);
			
 
				 	struct inode *old_inode = d_inode(old_dentry);
			
 
				-	struct timespec ctime = CURRENT_TIME;
			
 
				 	u64 index = 0;
			
 
				 	u64 root_objectid;
			
 
				 	int ret;
			
@@ -9365,9 +9491,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
				 	inode_inc_iversion(old_dir);
			
 
				 	inode_inc_iversion(new_dir);
			
 
				 	inode_inc_iversion(old_inode);
			
 
				-	old_dir->i_ctime = old_dir->i_mtime = ctime;
			
 
				-	new_dir->i_ctime = new_dir->i_mtime = ctime;
			
 
				-	old_inode->i_ctime = ctime;
			
 
				+	old_dir->i_ctime = old_dir->i_mtime =
			
 
				+	new_dir->i_ctime = new_dir->i_mtime =
			
 
				+	old_inode->i_ctime = current_fs_time(old_dir->i_sb);
			
 
				 
			
 
				 	if (old_dentry->d_parent != new_dentry->d_parent)
			
 
				 		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
			
@@ -9392,7 +9518,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
				 
			
 
				 	if (new_inode) {
			
 
				 		inode_inc_iversion(new_inode);
			
 
				-		new_inode->i_ctime = CURRENT_TIME;
			
 
				+		new_inode->i_ctime = current_fs_time(new_inode->i_sb);
			
 
				 		if (unlikely(btrfs_ino(new_inode) ==
			
 
				 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			
 
				 			root_objectid = BTRFS_I(new_inode)->location.objectid;
			
@@ -9870,7 +9996,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 
				 		*alloc_hint = ins.objectid + ins.offset;
			
 
				 
			
 
				 		inode_inc_iversion(inode);
			
 
				-		inode->i_ctime = CURRENT_TIME;
			
 
				+		inode->i_ctime = current_fs_time(inode->i_sb);
			
 
				 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
			
 
				 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
			
 
				 		    (actual_len > inode->i_size) &&
			
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,8 @@
 
				 #include "props.h"
			
 
				 #include "sysfs.h"
			
 
				 #include "qgroup.h"
			
 
				+#include "tree-log.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 #ifdef CONFIG_64BIT
			
 
				 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
			
@@ -347,7 +349,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
				 
			
 
				 	btrfs_update_iflags(inode);
			
 
				 	inode_inc_iversion(inode);
			
 
				-	inode->i_ctime = CURRENT_TIME;
			
 
				+	inode->i_ctime = current_fs_time(inode->i_sb);
			
 
				 	ret = btrfs_update_inode(trans, root, inode);
			
 
				 
			
 
				 	btrfs_end_transaction(trans, root);
			
@@ -443,7 +445,7 @@ static noinline int create_subvol(struct inode *dir,
 
				 	struct btrfs_root *root = BTRFS_I(dir)->root;
			
 
				 	struct btrfs_root *new_root;
			
 
				 	struct btrfs_block_rsv block_rsv;
			
 
				-	struct timespec cur_time = CURRENT_TIME;
			
 
				+	struct timespec cur_time = current_fs_time(dir->i_sb);
			
 
				 	struct inode *inode;
			
 
				 	int ret;
			
 
				 	int err;
			
@@ -844,10 +846,6 @@ static noinline int btrfs_mksubvol(struct path *parent,
 
				 	if (IS_ERR(dentry))
			
 
				 		goto out_unlock;
			
 
				 
			
 
				-	error = -EEXIST;
			
 
				-	if (d_really_is_positive(dentry))
			
 
				-		goto out_dput;
			
 
				-
			
 
				 	error = btrfs_may_create(dir, dentry);
			
 
				 	if (error)
			
 
				 		goto out_dput;
			
@@ -2097,8 +2095,6 @@ static noinline int search_ioctl(struct inode *inode,
 
				 		key.offset = (u64)-1;
			
 
				 		root = btrfs_read_fs_root_no_name(info, &key);
			
 
				 		if (IS_ERR(root)) {
			
 
				-			btrfs_err(info, "could not find root %llu",
			
 
				-			       sk->tree_id);
			
 
				 			btrfs_free_path(path);
			
 
				 			return -ENOENT;
			
 
				 		}
			
@@ -2476,6 +2472,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 
				 	trans->block_rsv = &block_rsv;
			
 
				 	trans->bytes_reserved = block_rsv.size;
			
 
				 
			
 
				+	btrfs_record_snapshot_destroy(trans, dir);
			
 
				+
			
 
				 	ret = btrfs_unlink_subvol(trans, root, dir,
			
 
				 				dest->root_key.objectid,
			
 
				 				dentry->d_name.name,
			
@@ -2960,8 +2958,8 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
 
				 	 * of the array is bounded by len, which is in turn bounded by
			
 
				 	 * BTRFS_MAX_DEDUPE_LEN.
			
 
				 	 */
			
 
				-	src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
			
 
				-	dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
			
 
				+	src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
			
 
				+	dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
			
 
				 	if (!src_pgarr || !dst_pgarr) {
			
 
				 		kfree(src_pgarr);
			
 
				 		kfree(dst_pgarr);
			
@@ -3066,6 +3064,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 
				 		inode_lock(src);
			
 
				 
			
 
				 		ret = extent_same_check_offsets(src, loff, &len, olen);
			
 
				+		if (ret)
			
 
				+			goto out_unlock;
			
 
				+		ret = extent_same_check_offsets(src, dst_loff, &len, olen);
			
 
				 		if (ret)
			
 
				 			goto out_unlock;
			
 
				 
			
@@ -3217,7 +3218,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 
				 
			
 
				 	inode_inc_iversion(inode);
			
 
				 	if (!no_time_update)
			
 
				-		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
			
 
				+		inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
			
 
				 	/*
			
 
				 	 * We round up to the block size at eof when determining which
			
 
				 	 * extents to clone above, but shouldn't round up the file size.
			
@@ -3889,8 +3890,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 
				 	 * Truncate page cache pages so that future reads will see the cloned
			
 
				 	 * data immediately and not the previous data.
			
 
				 	 */
			
 
				-	truncate_inode_pages_range(&inode->i_data, destoff,
			
 
				-				   PAGE_CACHE_ALIGN(destoff + len) - 1);
			
 
				+	truncate_inode_pages_range(&inode->i_data,
			
 
				+				round_down(destoff, PAGE_CACHE_SIZE),
			
 
				+				round_up(destoff + len, PAGE_CACHE_SIZE) - 1);
			
 
				 out_unlock:
			
 
				 	if (!same_inode)
			
 
				 		btrfs_double_inode_unlock(src, inode);
			
@@ -5031,7 +5033,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
 
				 	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				 	struct btrfs_root_item *root_item = &root->root_item;
			
 
				 	struct btrfs_trans_handle *trans;
			
 
				-	struct timespec ct = CURRENT_TIME;
			
 
				+	struct timespec ct = current_fs_time(inode->i_sb);
			
 
				 	int ret = 0;
			
 
				 	int received_uuid_changed;
			
 
				 
			
@@ -5262,8 +5264,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
 
				 	  .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
			
 
				 	  .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
			
 
				 
			
 
				-static int btrfs_ioctl_get_supported_features(struct file *file,
			
 
				-					      void __user *arg)
			
 
				+int btrfs_ioctl_get_supported_features(void __user *arg)
			
 
				 {
			
 
				 	static const struct btrfs_ioctl_feature_flags features[3] = {
			
 
				 		INIT_FEATURE_FLAGS(SUPP),
			
@@ -5542,7 +5543,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 
				 	case BTRFS_IOC_SET_FSLABEL:
			
 
				 		return btrfs_ioctl_set_fslabel(file, argp);
			
 
				 	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
			
 
				-		return btrfs_ioctl_get_supported_features(file, argp);
			
 
				+		return btrfs_ioctl_get_supported_features(argp);
			
 
				 	case BTRFS_IOC_GET_FEATURES:
			
 
				 		return btrfs_ioctl_get_features(file, argp);
			
 
				 	case BTRFS_IOC_SET_FEATURES:
			
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,7 @@
 
				 #include "btrfs_inode.h"
			
 
				 #include "extent_io.h"
			
 
				 #include "disk-io.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 static struct kmem_cache *btrfs_ordered_extent_cache;
			
 
				 
			
@@ -1009,7 +1010,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 
				 	for (; node; node = rb_prev(node)) {
			
 
				 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
			
 
				 
			
 
				-		/* We treat this entry as if it doesnt exist */
			
 
				+		/* We treat this entry as if it doesn't exist */
			
 
				 		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
			
 
				 			continue;
			
 
				 		if (test->file_offset + test->len <= disk_i_size)
			
@@ -1114,6 +1115,5 @@ int __init ordered_data_init(void)
 
				 
			
 
				 void ordered_data_exit(void)
			
 
				 {
			
 
				-	if (btrfs_ordered_extent_cache)
			
 
				-		kmem_cache_destroy(btrfs_ordered_extent_cache);
			
 
				+	kmem_cache_destroy(btrfs_ordered_extent_cache);
			
 
				 }
			
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -295,8 +295,27 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 
				 			       btrfs_dev_extent_chunk_offset(l, dev_extent),
			
 
				 			       btrfs_dev_extent_length(l, dev_extent));
			
 
				 			break;
			
 
				-		case BTRFS_DEV_STATS_KEY:
			
 
				-			printk(KERN_INFO "\t\tdevice stats\n");
			
 
				+		case BTRFS_PERSISTENT_ITEM_KEY:
			
 
				+			printk(KERN_INFO "\t\tpersistent item objectid %llu offset %llu\n",
			
 
				+					key.objectid, key.offset);
			
 
				+			switch (key.objectid) {
			
 
				+			case BTRFS_DEV_STATS_OBJECTID:
			
 
				+				printk(KERN_INFO "\t\tdevice stats\n");
			
 
				+				break;
			
 
				+			default:
			
 
				+				printk(KERN_INFO "\t\tunknown persistent item\n");
			
 
				+			}
			
 
				+			break;
			
 
				+		case BTRFS_TEMPORARY_ITEM_KEY:
			
 
				+			printk(KERN_INFO "\t\ttemporary item objectid %llu offset %llu\n",
			
 
				+					key.objectid, key.offset);
			
 
				+			switch (key.objectid) {
			
 
				+			case BTRFS_BALANCE_OBJECTID:
			
 
				+				printk(KERN_INFO "\t\tbalance status\n");
			
 
				+				break;
			
 
				+			default:
			
 
				+				printk(KERN_INFO "\t\tunknown temporary item\n");
			
 
				+			}
			
 
				 			break;
			
 
				 		case BTRFS_DEV_REPLACE_KEY:
			
 
				 			printk(KERN_INFO "\t\tdev replace\n");
			
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -22,6 +22,7 @@
 
				 #include "hash.h"
			
 
				 #include "transaction.h"
			
 
				 #include "xattr.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 #define BTRFS_PROP_HANDLERS_HT_BITS 8
			
 
				 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
			
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -72,7 +72,7 @@ struct reada_extent {
 
				 	spinlock_t		lock;
			
 
				 	struct reada_zone	*zones[BTRFS_MAX_MIRRORS];
			
 
				 	int			nzones;
			
 
				-	struct btrfs_device	*scheduled_for;
			
 
				+	int			scheduled;
			
 
				 };
			
 
				 
			
 
				 struct reada_zone {
			
@@ -101,67 +101,53 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info);
 
				 static void __reada_start_machine(struct btrfs_fs_info *fs_info);
			
 
				 
			
 
				 static int reada_add_block(struct reada_control *rc, u64 logical,
			
 
				-			   struct btrfs_key *top, int level, u64 generation);
			
 
				+			   struct btrfs_key *top, u64 generation);
			
 
				 
			
 
				 /* recurses */
			
 
				 /* in case of err, eb might be NULL */
			
 
				-static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
			
 
				-			    u64 start, int err)
			
 
				+static void __readahead_hook(struct btrfs_fs_info *fs_info,
			
 
				+			     struct reada_extent *re, struct extent_buffer *eb,
			
 
				+			     u64 start, int err)
			
 
				 {
			
 
				 	int level = 0;
			
 
				 	int nritems;
			
 
				 	int i;
			
 
				 	u64 bytenr;
			
 
				 	u64 generation;
			
 
				-	struct reada_extent *re;
			
 
				-	struct btrfs_fs_info *fs_info = root->fs_info;
			
 
				 	struct list_head list;
			
 
				-	unsigned long index = start >> PAGE_CACHE_SHIFT;
			
 
				-	struct btrfs_device *for_dev;
			
 
				 
			
 
				 	if (eb)
			
 
				 		level = btrfs_header_level(eb);
			
 
				 
			
 
				-	/* find extent */
			
 
				-	spin_lock(&fs_info->reada_lock);
			
 
				-	re = radix_tree_lookup(&fs_info->reada_tree, index);
			
 
				-	if (re)
			
 
				-		re->refcnt++;
			
 
				-	spin_unlock(&fs_info->reada_lock);
			
 
				-
			
 
				-	if (!re)
			
 
				-		return -1;
			
 
				-
			
 
				 	spin_lock(&re->lock);
			
 
				 	/*
			
 
				 	 * just take the full list from the extent. afterwards we
			
 
				 	 * don't need the lock anymore
			
 
				 	 */
			
 
				 	list_replace_init(&re->extctl, &list);
			
 
				-	for_dev = re->scheduled_for;
			
 
				-	re->scheduled_for = NULL;
			
 
				+	re->scheduled = 0;
			
 
				 	spin_unlock(&re->lock);
			
 
				 
			
 
				-	if (err == 0) {
			
 
				-		nritems = level ? btrfs_header_nritems(eb) : 0;
			
 
				-		generation = btrfs_header_generation(eb);
			
 
				-		/*
			
 
				-		 * FIXME: currently we just set nritems to 0 if this is a leaf,
			
 
				-		 * effectively ignoring the content. In a next step we could
			
 
				-		 * trigger more readahead depending from the content, e.g.
			
 
				-		 * fetch the checksums for the extents in the leaf.
			
 
				-		 */
			
 
				-	} else {
			
 
				-		/*
			
 
				-		 * this is the error case, the extent buffer has not been
			
 
				-		 * read correctly. We won't access anything from it and
			
 
				-		 * just cleanup our data structures. Effectively this will
			
 
				-		 * cut the branch below this node from read ahead.
			
 
				-		 */
			
 
				-		nritems = 0;
			
 
				-		generation = 0;
			
 
				-	}
			
 
				+	/*
			
 
				+	 * this is the error case, the extent buffer has not been
			
 
				+	 * read correctly. We won't access anything from it and
			
 
				+	 * just cleanup our data structures. Effectively this will
			
 
				+	 * cut the branch below this node from read ahead.
			
 
				+	 */
			
 
				+	if (err)
			
 
				+		goto cleanup;
			
 
				 
			
 
				+	/*
			
 
				+	 * FIXME: currently we just set nritems to 0 if this is a leaf,
			
 
				+	 * effectively ignoring the content. In a next step we could
			
 
				+	 * trigger more readahead depending from the content, e.g.
			
 
				+	 * fetch the checksums for the extents in the leaf.
			
 
				+	 */
			
 
				+	if (!level)
			
 
				+		goto cleanup;
			
 
				+
			
 
				+	nritems = btrfs_header_nritems(eb);
			
 
				+	generation = btrfs_header_generation(eb);
			
 
				 	for (i = 0; i < nritems; i++) {
			
 
				 		struct reada_extctl *rec;
			
 
				 		u64 n_gen;
			
@@ -188,19 +174,20 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 
				 			 */
			
 
				 #ifdef DEBUG
			
 
				 			if (rec->generation != generation) {
			
 
				-				btrfs_debug(root->fs_info,
			
 
				-					   "generation mismatch for (%llu,%d,%llu) %llu != %llu",
			
 
				-				       key.objectid, key.type, key.offset,
			
 
				-				       rec->generation, generation);
			
 
				+				btrfs_debug(fs_info,
			
 
				+					    "generation mismatch for (%llu,%d,%llu) %llu != %llu",
			
 
				+					    key.objectid, key.type, key.offset,
			
 
				+					    rec->generation, generation);
			
 
				 			}
			
 
				 #endif
			
 
				 			if (rec->generation == generation &&
			
 
				 			    btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
			
 
				 			    btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
			
 
				-				reada_add_block(rc, bytenr, &next_key,
			
 
				-						level - 1, n_gen);
			
 
				+				reada_add_block(rc, bytenr, &next_key, n_gen);
			
 
				 		}
			
 
				 	}
			
 
				+
			
 
				+cleanup:
			
 
				 	/*
			
 
				 	 * free extctl records
			
 
				 	 */
			
@@ -222,26 +209,37 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 
				 
			
 
				 		reada_extent_put(fs_info, re);	/* one ref for each entry */
			
 
				 	}
			
 
				-	reada_extent_put(fs_info, re);	/* our ref */
			
 
				-	if (for_dev)
			
 
				-		atomic_dec(&for_dev->reada_in_flight);
			
 
				 
			
 
				-	return 0;
			
 
				+	return;
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				  * start is passed separately in case eb in NULL, which may be the case with
			
 
				  * failed I/O
			
 
				  */
			
 
				-int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
			
 
				-			 u64 start, int err)
			
 
				+int btree_readahead_hook(struct btrfs_fs_info *fs_info,
			
 
				+			 struct extent_buffer *eb, u64 start, int err)
			
 
				 {
			
 
				-	int ret;
			
 
				+	int ret = 0;
			
 
				+	struct reada_extent *re;
			
 
				 
			
 
				-	ret = __readahead_hook(root, eb, start, err);
			
 
				+	/* find extent */
			
 
				+	spin_lock(&fs_info->reada_lock);
			
 
				+	re = radix_tree_lookup(&fs_info->reada_tree,
			
 
				+			       start >> PAGE_CACHE_SHIFT);
			
 
				+	if (re)
			
 
				+		re->refcnt++;
			
 
				+	spin_unlock(&fs_info->reada_lock);
			
 
				+	if (!re) {
			
 
				+		ret = -1;
			
 
				+		goto start_machine;
			
 
				+	}
			
 
				 
			
 
				-	reada_start_machine(root->fs_info);
			
 
				+	__readahead_hook(fs_info, re, eb, start, err);
			
 
				+	reada_extent_put(fs_info, re);	/* our ref */
			
 
				 
			
 
				+start_machine:
			
 
				+	reada_start_machine(fs_info);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -260,18 +258,14 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 
				 	spin_lock(&fs_info->reada_lock);
			
 
				 	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
			
 
				 				     logical >> PAGE_CACHE_SHIFT, 1);
			
 
				-	if (ret == 1)
			
 
				+	if (ret == 1 && logical >= zone->start && logical <= zone->end) {
			
 
				 		kref_get(&zone->refcnt);
			
 
				-	spin_unlock(&fs_info->reada_lock);
			
 
				-
			
 
				-	if (ret == 1) {
			
 
				-		if (logical >= zone->start && logical < zone->end)
			
 
				-			return zone;
			
 
				-		spin_lock(&fs_info->reada_lock);
			
 
				-		kref_put(&zone->refcnt, reada_zone_release);
			
 
				 		spin_unlock(&fs_info->reada_lock);
			
 
				+		return zone;
			
 
				 	}
			
 
				 
			
 
				+	spin_unlock(&fs_info->reada_lock);
			
 
				+
			
 
				 	cache = btrfs_lookup_block_group(fs_info, logical);
			
 
				 	if (!cache)
			
 
				 		return NULL;
			
@@ -280,7 +274,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 
				 	end = start + cache->key.offset - 1;
			
 
				 	btrfs_put_block_group(cache);
			
 
				 
			
 
				-	zone = kzalloc(sizeof(*zone), GFP_NOFS);
			
 
				+	zone = kzalloc(sizeof(*zone), GFP_KERNEL);
			
 
				 	if (!zone)
			
 
				 		return NULL;
			
 
				 
			
@@ -307,8 +301,10 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 
				 		kfree(zone);
			
 
				 		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
			
 
				 					     logical >> PAGE_CACHE_SHIFT, 1);
			
 
				-		if (ret == 1)
			
 
				+		if (ret == 1 && logical >= zone->start && logical <= zone->end)
			
 
				 			kref_get(&zone->refcnt);
			
 
				+		else
			
 
				+			zone = NULL;
			
 
				 	}
			
 
				 	spin_unlock(&fs_info->reada_lock);
			
 
				 
			
@@ -317,7 +313,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 
				 
			
 
				 static struct reada_extent *reada_find_extent(struct btrfs_root *root,
			
 
				 					      u64 logical,
			
 
				-					      struct btrfs_key *top, int level)
			
 
				+					      struct btrfs_key *top)
			
 
				 {
			
 
				 	int ret;
			
 
				 	struct reada_extent *re = NULL;
			
@@ -330,9 +326,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 	u64 length;
			
 
				 	int real_stripes;
			
 
				 	int nzones = 0;
			
 
				-	int i;
			
 
				 	unsigned long index = logical >> PAGE_CACHE_SHIFT;
			
 
				 	int dev_replace_is_ongoing;
			
 
				+	int have_zone = 0;
			
 
				 
			
 
				 	spin_lock(&fs_info->reada_lock);
			
 
				 	re = radix_tree_lookup(&fs_info->reada_tree, index);
			
@@ -343,7 +339,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 	if (re)
			
 
				 		return re;
			
 
				 
			
 
				-	re = kzalloc(sizeof(*re), GFP_NOFS);
			
 
				+	re = kzalloc(sizeof(*re), GFP_KERNEL);
			
 
				 	if (!re)
			
 
				 		return NULL;
			
 
				 
			
@@ -375,11 +371,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 		struct reada_zone *zone;
			
 
				 
			
 
				 		dev = bbio->stripes[nzones].dev;
			
 
				+
			
 
				+		/* cannot read ahead on missing device. */
			
 
				+		 if (!dev->bdev)
			
 
				+			continue;
			
 
				+
			
 
				 		zone = reada_find_zone(fs_info, dev, logical, bbio);
			
 
				 		if (!zone)
			
 
				-			break;
			
 
				+			continue;
			
 
				 
			
 
				-		re->zones[nzones] = zone;
			
 
				+		re->zones[re->nzones++] = zone;
			
 
				 		spin_lock(&zone->lock);
			
 
				 		if (!zone->elems)
			
 
				 			kref_get(&zone->refcnt);
			
@@ -389,14 +390,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 		kref_put(&zone->refcnt, reada_zone_release);
			
 
				 		spin_unlock(&fs_info->reada_lock);
			
 
				 	}
			
 
				-	re->nzones = nzones;
			
 
				-	if (nzones == 0) {
			
 
				+	if (re->nzones == 0) {
			
 
				 		/* not a single zone found, error and out */
			
 
				 		goto error;
			
 
				 	}
			
 
				 
			
 
				 	/* insert extent in reada_tree + all per-device trees, all or nothing */
			
 
				-	btrfs_dev_replace_lock(&fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
			
 
				 	spin_lock(&fs_info->reada_lock);
			
 
				 	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
			
 
				 	if (ret == -EEXIST) {
			
@@ -404,19 +404,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 		BUG_ON(!re_exist);
			
 
				 		re_exist->refcnt++;
			
 
				 		spin_unlock(&fs_info->reada_lock);
			
 
				-		btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+		btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
			
 
				 		goto error;
			
 
				 	}
			
 
				 	if (ret) {
			
 
				 		spin_unlock(&fs_info->reada_lock);
			
 
				-		btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+		btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
			
 
				 		goto error;
			
 
				 	}
			
 
				 	prev_dev = NULL;
			
 
				 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
			
 
				 			&fs_info->dev_replace);
			
 
				-	for (i = 0; i < nzones; ++i) {
			
 
				-		dev = bbio->stripes[i].dev;
			
 
				+	for (nzones = 0; nzones < re->nzones; ++nzones) {
			
 
				+		dev = re->zones[nzones]->device;
			
 
				+
			
 
				 		if (dev == prev_dev) {
			
 
				 			/*
			
 
				 			 * in case of DUP, just add the first zone. As both
			
@@ -427,15 +428,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 			 */
			
 
				 			continue;
			
 
				 		}
			
 
				-		if (!dev->bdev) {
			
 
				-			/*
			
 
				-			 * cannot read ahead on missing device, but for RAID5/6,
			
 
				-			 * REQ_GET_READ_MIRRORS return 1. So don't skip missing
			
 
				-			 * device for such case.
			
 
				-			 */
			
 
				-			if (nzones > 1)
			
 
				-				continue;
			
 
				-		}
			
 
				+		if (!dev->bdev)
			
 
				+			continue;
			
 
				+
			
 
				 		if (dev_replace_is_ongoing &&
			
 
				 		    dev == fs_info->dev_replace.tgtdev) {
			
 
				 			/*
			
@@ -447,8 +442,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 		prev_dev = dev;
			
 
				 		ret = radix_tree_insert(&dev->reada_extents, index, re);
			
 
				 		if (ret) {
			
 
				-			while (--i >= 0) {
			
 
				-				dev = bbio->stripes[i].dev;
			
 
				+			while (--nzones >= 0) {
			
 
				+				dev = re->zones[nzones]->device;
			
 
				 				BUG_ON(dev == NULL);
			
 
				 				/* ignore whether the entry was inserted */
			
 
				 				radix_tree_delete(&dev->reada_extents, index);
			
@@ -456,21 +451,24 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 
				 			BUG_ON(fs_info == NULL);
			
 
				 			radix_tree_delete(&fs_info->reada_tree, index);
			
 
				 			spin_unlock(&fs_info->reada_lock);
			
 
				-			btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+			btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
			
 
				 			goto error;
			
 
				 		}
			
 
				+		have_zone = 1;
			
 
				 	}
			
 
				 	spin_unlock(&fs_info->reada_lock);
			
 
				-	btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
			
 
				+
			
 
				+	if (!have_zone)
			
 
				+		goto error;
			
 
				 
			
 
				 	btrfs_put_bbio(bbio);
			
 
				 	return re;
			
 
				 
			
 
				 error:
			
 
				-	while (nzones) {
			
 
				+	for (nzones = 0; nzones < re->nzones; ++nzones) {
			
 
				 		struct reada_zone *zone;
			
 
				 
			
 
				-		--nzones;
			
 
				 		zone = re->zones[nzones];
			
 
				 		kref_get(&zone->refcnt);
			
 
				 		spin_lock(&zone->lock);
			
@@ -531,8 +529,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
 
				 		kref_put(&zone->refcnt, reada_zone_release);
			
 
				 		spin_unlock(&fs_info->reada_lock);
			
 
				 	}
			
 
				-	if (re->scheduled_for)
			
 
				-		atomic_dec(&re->scheduled_for->reada_in_flight);
			
 
				 
			
 
				 	kfree(re);
			
 
				 }
			
@@ -556,17 +552,17 @@ static void reada_control_release(struct kref *kref)
 
				 }
			
 
				 
			
 
				 static int reada_add_block(struct reada_control *rc, u64 logical,
			
 
				-			   struct btrfs_key *top, int level, u64 generation)
			
 
				+			   struct btrfs_key *top, u64 generation)
			
 
				 {
			
 
				 	struct btrfs_root *root = rc->root;
			
 
				 	struct reada_extent *re;
			
 
				 	struct reada_extctl *rec;
			
 
				 
			
 
				-	re = reada_find_extent(root, logical, top, level); /* takes one ref */
			
 
				+	re = reada_find_extent(root, logical, top); /* takes one ref */
			
 
				 	if (!re)
			
 
				 		return -1;
			
 
				 
			
 
				-	rec = kzalloc(sizeof(*rec), GFP_NOFS);
			
 
				+	rec = kzalloc(sizeof(*rec), GFP_KERNEL);
			
 
				 	if (!rec) {
			
 
				 		reada_extent_put(root->fs_info, re);
			
 
				 		return -ENOMEM;
			
@@ -662,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
 
				 	u64 logical;
			
 
				 	int ret;
			
 
				 	int i;
			
 
				-	int need_kick = 0;
			
 
				 
			
 
				 	spin_lock(&fs_info->reada_lock);
			
 
				 	if (dev->reada_curr_zone == NULL) {
			
@@ -679,7 +674,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
 
				 	 */
			
 
				 	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
			
 
				 				     dev->reada_next >> PAGE_CACHE_SHIFT, 1);
			
 
				-	if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
			
 
				+	if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
			
 
				 		ret = reada_pick_zone(dev);
			
 
				 		if (!ret) {
			
 
				 			spin_unlock(&fs_info->reada_lock);
			
@@ -698,6 +693,15 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
 
				 
			
 
				 	spin_unlock(&fs_info->reada_lock);
			
 
				 
			
 
				+	spin_lock(&re->lock);
			
 
				+	if (re->scheduled || list_empty(&re->extctl)) {
			
 
				+		spin_unlock(&re->lock);
			
 
				+		reada_extent_put(fs_info, re);
			
 
				+		return 0;
			
 
				+	}
			
 
				+	re->scheduled = 1;
			
 
				+	spin_unlock(&re->lock);
			
 
				+
			
 
				 	/*
			
 
				 	 * find mirror num
			
 
				 	 */
			
@@ -709,29 +713,20 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
 
				 	}
			
 
				 	logical = re->logical;
			
 
				 
			
 
				-	spin_lock(&re->lock);
			
 
				-	if (re->scheduled_for == NULL) {
			
 
				-		re->scheduled_for = dev;
			
 
				-		need_kick = 1;
			
 
				-	}
			
 
				-	spin_unlock(&re->lock);
			
 
				-
			
 
				-	reada_extent_put(fs_info, re);
			
 
				-
			
 
				-	if (!need_kick)
			
 
				-		return 0;
			
 
				-
			
 
				 	atomic_inc(&dev->reada_in_flight);
			
 
				 	ret = reada_tree_block_flagged(fs_info->extent_root, logical,
			
 
				 			mirror_num, &eb);
			
 
				 	if (ret)
			
 
				-		__readahead_hook(fs_info->extent_root, NULL, logical, ret);
			
 
				+		__readahead_hook(fs_info, re, NULL, logical, ret);
			
 
				 	else if (eb)
			
 
				-		__readahead_hook(fs_info->extent_root, eb, eb->start, ret);
			
 
				+		__readahead_hook(fs_info, re, eb, eb->start, ret);
			
 
				 
			
 
				 	if (eb)
			
 
				 		free_extent_buffer(eb);
			
 
				 
			
 
				+	atomic_dec(&dev->reada_in_flight);
			
 
				+	reada_extent_put(fs_info, re);
			
 
				+
			
 
				 	return 1;
			
 
				 
			
 
				 }
			
@@ -752,6 +747,8 @@ static void reada_start_machine_worker(struct btrfs_work *work)
 
				 	set_task_ioprio(current, BTRFS_IOPRIO_READA);
			
 
				 	__reada_start_machine(fs_info);
			
 
				 	set_task_ioprio(current, old_ioprio);
			
 
				+
			
 
				+	atomic_dec(&fs_info->reada_works_cnt);
			
 
				 }
			
 
				 
			
 
				 static void __reada_start_machine(struct btrfs_fs_info *fs_info)
			
@@ -783,15 +780,19 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
 
				 	 * enqueue to workers to finish it. This will distribute the load to
			
 
				 	 * the cores.
			
 
				 	 */
			
 
				-	for (i = 0; i < 2; ++i)
			
 
				+	for (i = 0; i < 2; ++i) {
			
 
				 		reada_start_machine(fs_info);
			
 
				+		if (atomic_read(&fs_info->reada_works_cnt) >
			
 
				+		    BTRFS_MAX_MIRRORS * 2)
			
 
				+			break;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static void reada_start_machine(struct btrfs_fs_info *fs_info)
			
 
				 {
			
 
				 	struct reada_machine_work *rmw;
			
 
				 
			
 
				-	rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
			
 
				+	rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
			
 
				 	if (!rmw) {
			
 
				 		/* FIXME we cannot handle this properly right now */
			
 
				 		BUG();
			
@@ -801,6 +802,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
 
				 	rmw->fs_info = fs_info;
			
 
				 
			
 
				 	btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
			
 
				+	atomic_inc(&fs_info->reada_works_cnt);
			
 
				 }
			
 
				 
			
 
				 #ifdef DEBUG
			
@@ -848,10 +850,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 
				 			if (ret == 0)
			
 
				 				break;
			
 
				 			printk(KERN_DEBUG
			
 
				-				"  re: logical %llu size %u empty %d for %lld",
			
 
				+				"  re: logical %llu size %u empty %d scheduled %d",
			
 
				 				re->logical, fs_info->tree_root->nodesize,
			
 
				-				list_empty(&re->extctl), re->scheduled_for ?
			
 
				-				re->scheduled_for->devid : -1);
			
 
				+				list_empty(&re->extctl), re->scheduled);
			
 
				 
			
 
				 			for (i = 0; i < re->nzones; ++i) {
			
 
				 				printk(KERN_CONT " zone %llu-%llu devs",
			
@@ -878,27 +879,21 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 
				 					     index, 1);
			
 
				 		if (ret == 0)
			
 
				 			break;
			
 
				-		if (!re->scheduled_for) {
			
 
				+		if (!re->scheduled) {
			
 
				 			index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
			
 
				 			continue;
			
 
				 		}
			
 
				 		printk(KERN_DEBUG
			
 
				-			"re: logical %llu size %u list empty %d for %lld",
			
 
				+			"re: logical %llu size %u list empty %d scheduled %d",
			
 
				 			re->logical, fs_info->tree_root->nodesize,
			
 
				-			list_empty(&re->extctl),
			
 
				-			re->scheduled_for ? re->scheduled_for->devid : -1);
			
 
				+			list_empty(&re->extctl), re->scheduled);
			
 
				 		for (i = 0; i < re->nzones; ++i) {
			
 
				 			printk(KERN_CONT " zone %llu-%llu devs",
			
 
				 				re->zones[i]->start,
			
 
				 				re->zones[i]->end);
			
 
				-			for (i = 0; i < re->nzones; ++i) {
			
 
				-				printk(KERN_CONT " zone %llu-%llu devs",
			
 
				-					re->zones[i]->start,
			
 
				-					re->zones[i]->end);
			
 
				-				for (j = 0; j < re->zones[i]->ndevs; ++j) {
			
 
				-					printk(KERN_CONT " %lld",
			
 
				-						re->zones[i]->devs[j]->devid);
			
 
				-				}
			
 
				+			for (j = 0; j < re->zones[i]->ndevs; ++j) {
			
 
				+				printk(KERN_CONT " %lld",
			
 
				+				       re->zones[i]->devs[j]->devid);
			
 
				 			}
			
 
				 		}
			
 
				 		printk(KERN_CONT "\n");
			
@@ -917,7 +912,6 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 
				 	struct reada_control *rc;
			
 
				 	u64 start;
			
 
				 	u64 generation;
			
 
				-	int level;
			
 
				 	int ret;
			
 
				 	struct extent_buffer *node;
			
 
				 	static struct btrfs_key max_key = {
			
@@ -926,7 +920,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 
				 		.offset = (u64)-1
			
 
				 	};
			
 
				 
			
 
				-	rc = kzalloc(sizeof(*rc), GFP_NOFS);
			
 
				+	rc = kzalloc(sizeof(*rc), GFP_KERNEL);
			
 
				 	if (!rc)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 
			
@@ -940,11 +934,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 
				 
			
 
				 	node = btrfs_root_node(root);
			
 
				 	start = node->start;
			
 
				-	level = btrfs_header_level(node);
			
 
				 	generation = btrfs_header_generation(node);
			
 
				 	free_extent_buffer(node);
			
 
				 
			
 
				-	ret = reada_add_block(rc, start, &max_key, level, generation);
			
 
				+	ret = reada_add_block(rc, start, &max_key, generation);
			
 
				 	if (ret) {
			
 
				 		kfree(rc);
			
 
				 		return ERR_PTR(ret);
			
@@ -959,8 +952,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 
				 int btrfs_reada_wait(void *handle)
			
 
				 {
			
 
				 	struct reada_control *rc = handle;
			
 
				+	struct btrfs_fs_info *fs_info = rc->root->fs_info;
			
 
				 
			
 
				 	while (atomic_read(&rc->elems)) {
			
 
				+		if (!atomic_read(&fs_info->reada_works_cnt))
			
 
				+			reada_start_machine(fs_info);
			
 
				 		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
			
 
				 				   5 * HZ);
			
 
				 		dump_devs(rc->root->fs_info,
			
@@ -977,9 +973,13 @@ int btrfs_reada_wait(void *handle)
 
				 int btrfs_reada_wait(void *handle)
			
 
				 {
			
 
				 	struct reada_control *rc = handle;
			
 
				+	struct btrfs_fs_info *fs_info = rc->root->fs_info;
			
 
				 
			
 
				 	while (atomic_read(&rc->elems)) {
			
 
				-		wait_event(rc->wait, atomic_read(&rc->elems) == 0);
			
 
				+		if (!atomic_read(&fs_info->reada_works_cnt))
			
 
				+			reada_start_machine(fs_info);
			
 
				+		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
			
 
				+				   (HZ + 9) / 10);
			
 
				 	}
			
 
				 
			
 
				 	kref_put(&rc->refcnt, reada_control_release);
			
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -496,7 +496,7 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 
				 			     struct btrfs_root *root)
			
 
				 {
			
 
				 	struct btrfs_root_item *item = &root->root_item;
			
 
				-	struct timespec ct = CURRENT_TIME;
			
 
				+	struct timespec ct = current_fs_time(root->fs_info->sb);
			
 
				 
			
 
				 	spin_lock(&root->root_item_lock);
			
 
				 	btrfs_set_root_ctransid(item, trans->transid);
			
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -461,7 +461,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 
				 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
			
 
				 	int ret;
			
 
				 
			
 
				-	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
			
 
				+	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
			
 
				 	if (!sctx)
			
 
				 		goto nomem;
			
 
				 	atomic_set(&sctx->refs, 1);
			
@@ -472,7 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 
				 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
			
 
				 		struct scrub_bio *sbio;
			
 
				 
			
 
				-		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
			
 
				+		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
			
 
				 		if (!sbio)
			
 
				 			goto nomem;
			
 
				 		sctx->bios[i] = sbio;
			
@@ -611,7 +611,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 
				 	u64 flags = 0;
			
 
				 	u64 ref_root;
			
 
				 	u32 item_size;
			
 
				-	u8 ref_level;
			
 
				+	u8 ref_level = 0;
			
 
				 	int ret;
			
 
				 
			
 
				 	WARN_ON(sblock->page_count < 1);
			
@@ -1654,7 +1654,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 
				 again:
			
 
				 	if (!wr_ctx->wr_curr_bio) {
			
 
				 		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
			
 
				-					      GFP_NOFS);
			
 
				+					      GFP_KERNEL);
			
 
				 		if (!wr_ctx->wr_curr_bio) {
			
 
				 			mutex_unlock(&wr_ctx->wr_lock);
			
 
				 			return -ENOMEM;
			
@@ -1671,7 +1671,8 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 
				 		sbio->dev = wr_ctx->tgtdev;
			
 
				 		bio = sbio->bio;
			
 
				 		if (!bio) {
			
 
				-			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
			
 
				+			bio = btrfs_io_bio_alloc(GFP_KERNEL,
			
 
				+					wr_ctx->pages_per_wr_bio);
			
 
				 			if (!bio) {
			
 
				 				mutex_unlock(&wr_ctx->wr_lock);
			
 
				 				return -ENOMEM;
			
@@ -2076,7 +2077,8 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 
				 		sbio->dev = spage->dev;
			
 
				 		bio = sbio->bio;
			
 
				 		if (!bio) {
			
 
				-			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
			
 
				+			bio = btrfs_io_bio_alloc(GFP_KERNEL,
			
 
				+					sctx->pages_per_rd_bio);
			
 
				 			if (!bio)
			
 
				 				return -ENOMEM;
			
 
				 			sbio->bio = bio;
			
@@ -2241,7 +2243,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 
				 	struct scrub_block *sblock;
			
 
				 	int index;
			
 
				 
			
 
				-	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
			
 
				+	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
			
 
				 	if (!sblock) {
			
 
				 		spin_lock(&sctx->stat_lock);
			
 
				 		sctx->stat.malloc_errors++;
			
@@ -2259,7 +2261,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 
				 		struct scrub_page *spage;
			
 
				 		u64 l = min_t(u64, len, PAGE_SIZE);
			
 
				 
			
 
				-		spage = kzalloc(sizeof(*spage), GFP_NOFS);
			
 
				+		spage = kzalloc(sizeof(*spage), GFP_KERNEL);
			
 
				 		if (!spage) {
			
 
				 leave_nomem:
			
 
				 			spin_lock(&sctx->stat_lock);
			
@@ -2286,7 +2288,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 
				 			spage->have_csum = 0;
			
 
				 		}
			
 
				 		sblock->page_count++;
			
 
				-		spage->page = alloc_page(GFP_NOFS);
			
 
				+		spage->page = alloc_page(GFP_KERNEL);
			
 
				 		if (!spage->page)
			
 
				 			goto leave_nomem;
			
 
				 		len -= l;
			
@@ -2541,7 +2543,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
 
				 	struct scrub_block *sblock;
			
 
				 	int index;
			
 
				 
			
 
				-	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
			
 
				+	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
			
 
				 	if (!sblock) {
			
 
				 		spin_lock(&sctx->stat_lock);
			
 
				 		sctx->stat.malloc_errors++;
			
@@ -2561,7 +2563,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
 
				 		struct scrub_page *spage;
			
 
				 		u64 l = min_t(u64, len, PAGE_SIZE);
			
 
				 
			
 
				-		spage = kzalloc(sizeof(*spage), GFP_NOFS);
			
 
				+		spage = kzalloc(sizeof(*spage), GFP_KERNEL);
			
 
				 		if (!spage) {
			
 
				 leave_nomem:
			
 
				 			spin_lock(&sctx->stat_lock);
			
@@ -2591,7 +2593,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
 
				 			spage->have_csum = 0;
			
 
				 		}
			
 
				 		sblock->page_count++;
			
 
				-		spage->page = alloc_page(GFP_NOFS);
			
 
				+		spage->page = alloc_page(GFP_KERNEL);
			
 
				 		if (!spage->page)
			
 
				 			goto leave_nomem;
			
 
				 		len -= l;
			
@@ -3857,16 +3859,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
				 		return -EIO;
			
 
				 	}
			
 
				 
			
 
				-	btrfs_dev_replace_lock(&fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
			
 
				 	if (dev->scrub_device ||
			
 
				 	    (!is_dev_replace &&
			
 
				 	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
			
 
				-		btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+		btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
			
 
				 		mutex_unlock(&fs_info->scrub_lock);
			
 
				 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			
 
				 		return -EINPROGRESS;
			
 
				 	}
			
 
				-	btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
			
 
				 
			
 
				 	ret = scrub_workers_get(fs_info, is_dev_replace);
			
 
				 	if (ret) {
			
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -34,6 +34,7 @@
 
				 #include "disk-io.h"
			
 
				 #include "btrfs_inode.h"
			
 
				 #include "transaction.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 static int g_verbose = 0;
			
 
				 
			
@@ -304,7 +305,7 @@ static struct fs_path *fs_path_alloc(void)
 
				 {
			
 
				 	struct fs_path *p;
			
 
				 
			
 
				-	p = kmalloc(sizeof(*p), GFP_NOFS);
			
 
				+	p = kmalloc(sizeof(*p), GFP_KERNEL);
			
 
				 	if (!p)
			
 
				 		return NULL;
			
 
				 	p->reversed = 0;
			
@@ -363,11 +364,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
 
				 	 * First time the inline_buf does not suffice
			
 
				 	 */
			
 
				 	if (p->buf == p->inline_buf) {
			
 
				-		tmp_buf = kmalloc(len, GFP_NOFS);
			
 
				+		tmp_buf = kmalloc(len, GFP_KERNEL);
			
 
				 		if (tmp_buf)
			
 
				 			memcpy(tmp_buf, p->buf, old_buf_len);
			
 
				 	} else {
			
 
				-		tmp_buf = krealloc(p->buf, len, GFP_NOFS);
			
 
				+		tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
			
 
				 	}
			
 
				 	if (!tmp_buf)
			
 
				 		return -ENOMEM;
			
@@ -995,7 +996,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 
				 	 * values are small.
			
 
				 	 */
			
 
				 	buf_len = PATH_MAX;
			
 
				-	buf = kmalloc(buf_len, GFP_NOFS);
			
 
				+	buf = kmalloc(buf_len, GFP_KERNEL);
			
 
				 	if (!buf) {
			
 
				 		ret = -ENOMEM;
			
 
				 		goto out;
			
@@ -1042,7 +1043,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 
				 				buf = NULL;
			
 
				 			} else {
			
 
				 				char *tmp = krealloc(buf, buf_len,
			
 
				-						     GFP_NOFS | __GFP_NOWARN);
			
 
				+						GFP_KERNEL | __GFP_NOWARN);
			
 
				 
			
 
				 				if (!tmp)
			
 
				 					kfree(buf);
			
@@ -1303,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 
				 	/* We only use this path under the commit sem */
			
 
				 	tmp_path->need_commit_sem = 0;
			
 
				 
			
 
				-	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
			
 
				+	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
			
 
				 	if (!backref_ctx) {
			
 
				 		ret = -ENOMEM;
			
 
				 		goto out;
			
@@ -1984,7 +1985,7 @@ static int name_cache_insert(struct send_ctx *sctx,
 
				 	nce_head = radix_tree_lookup(&sctx->name_cache,
			
 
				 			(unsigned long)nce->ino);
			
 
				 	if (!nce_head) {
			
 
				-		nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
			
 
				+		nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
			
 
				 		if (!nce_head) {
			
 
				 			kfree(nce);
			
 
				 			return -ENOMEM;
			
@@ -2179,7 +2180,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 
				 	/*
			
 
				 	 * Store the result of the lookup in the name cache.
			
 
				 	 */
			
 
				-	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
			
 
				+	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
			
 
				 	if (!nce) {
			
 
				 		ret = -ENOMEM;
			
 
				 		goto out;
			
@@ -2315,7 +2316,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
 
				 	if (!path)
			
 
				 		return -ENOMEM;
			
 
				 
			
 
				-	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
			
 
				+	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
			
 
				 	if (!name) {
			
 
				 		btrfs_free_path(path);
			
 
				 		return -ENOMEM;
			
@@ -2730,7 +2731,7 @@ static int __record_ref(struct list_head *head, u64 dir,
 
				 {
			
 
				 	struct recorded_ref *ref;
			
 
				 
			
 
				-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
			
 
				+	ref = kmalloc(sizeof(*ref), GFP_KERNEL);
			
 
				 	if (!ref)
			
 
				 		return -ENOMEM;
			
 
				 
			
@@ -2755,7 +2756,7 @@ static int dup_ref(struct recorded_ref *ref, struct list_head *list)
 
				 {
			
 
				 	struct recorded_ref *new;
			
 
				 
			
 
				-	new = kmalloc(sizeof(*ref), GFP_NOFS);
			
 
				+	new = kmalloc(sizeof(*ref), GFP_KERNEL);
			
 
				 	if (!new)
			
 
				 		return -ENOMEM;
			
 
				 
			
@@ -2818,7 +2819,7 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
 
				 	struct rb_node *parent = NULL;
			
 
				 	struct orphan_dir_info *entry, *odi;
			
 
				 
			
 
				-	odi = kmalloc(sizeof(*odi), GFP_NOFS);
			
 
				+	odi = kmalloc(sizeof(*odi), GFP_KERNEL);
			
 
				 	if (!odi)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 	odi->ino = dir_ino;
			
@@ -2973,7 +2974,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
 
				 	struct rb_node *parent = NULL;
			
 
				 	struct waiting_dir_move *entry, *dm;
			
 
				 
			
 
				-	dm = kmalloc(sizeof(*dm), GFP_NOFS);
			
 
				+	dm = kmalloc(sizeof(*dm), GFP_KERNEL);
			
 
				 	if (!dm)
			
 
				 		return -ENOMEM;
			
 
				 	dm->ino = ino;
			
@@ -3040,7 +3041,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
 
				 	int exists = 0;
			
 
				 	int ret;
			
 
				 
			
 
				-	pm = kmalloc(sizeof(*pm), GFP_NOFS);
			
 
				+	pm = kmalloc(sizeof(*pm), GFP_KERNEL);
			
 
				 	if (!pm)
			
 
				 		return -ENOMEM;
			
 
				 	pm->parent_ino = parent_ino;
			
@@ -4280,7 +4281,7 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
 
				 	    strncmp(name, ctx->name, name_len) == 0) {
			
 
				 		ctx->found_idx = num;
			
 
				 		ctx->found_data_len = data_len;
			
 
				-		ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
			
 
				+		ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
			
 
				 		if (!ctx->found_data)
			
 
				 			return -ENOMEM;
			
 
				 		return 1;
			
@@ -4481,7 +4482,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
 
				 	while (index <= last_index) {
			
 
				 		unsigned cur_len = min_t(unsigned, len,
			
 
				 					 PAGE_CACHE_SIZE - pg_offset);
			
 
				-		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
			
 
				+		page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
			
 
				 		if (!page) {
			
 
				 			ret = -ENOMEM;
			
 
				 			break;
			
@@ -5989,7 +5990,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
			
 
				+	sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
			
 
				 	if (!sctx) {
			
 
				 		ret = -ENOMEM;
			
 
				 		goto out;
			
@@ -5997,7 +5998,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 
				 
			
 
				 	INIT_LIST_HEAD(&sctx->new_refs);
			
 
				 	INIT_LIST_HEAD(&sctx->deleted_refs);
			
 
				-	INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
			
 
				+	INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
			
 
				 	INIT_LIST_HEAD(&sctx->name_cache_list);
			
 
				 
			
 
				 	sctx->flags = arg->flags;
			
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -303,7 +303,8 @@ enum {
 
				 	Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
			
 
				 	Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
			
 
				 	Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
			
 
				-	Opt_datasum, Opt_treelog, Opt_noinode_cache,
			
 
				+	Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
			
 
				+	Opt_nologreplay, Opt_norecovery,
			
 
				 #ifdef CONFIG_BTRFS_DEBUG
			
 
				 	Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
			
 
				 #endif
			
@@ -335,6 +336,8 @@ static const match_table_t tokens = {
 
				 	{Opt_noacl, "noacl"},
			
 
				 	{Opt_notreelog, "notreelog"},
			
 
				 	{Opt_treelog, "treelog"},
			
 
				+	{Opt_nologreplay, "nologreplay"},
			
 
				+	{Opt_norecovery, "norecovery"},
			
 
				 	{Opt_flushoncommit, "flushoncommit"},
			
 
				 	{Opt_noflushoncommit, "noflushoncommit"},
			
 
				 	{Opt_ratio, "metadata_ratio=%d"},
			
@@ -352,7 +355,8 @@ static const match_table_t tokens = {
 
				 	{Opt_inode_cache, "inode_cache"},
			
 
				 	{Opt_noinode_cache, "noinode_cache"},
			
 
				 	{Opt_no_space_cache, "nospace_cache"},
			
 
				-	{Opt_recovery, "recovery"},
			
 
				+	{Opt_recovery, "recovery"}, /* deprecated */
			
 
				+	{Opt_usebackuproot, "usebackuproot"},
			
 
				 	{Opt_skip_balance, "skip_balance"},
			
 
				 	{Opt_check_integrity, "check_int"},
			
 
				 	{Opt_check_integrity_including_extent_data, "check_int_data"},
			
@@ -373,7 +377,8 @@ static const match_table_t tokens = {
 
				  * reading in a new superblock is parsed here.
			
 
				  * XXX JDM: This needs to be cleaned up for remount.
			
 
				  */
			
 
				-int btrfs_parse_options(struct btrfs_root *root, char *options)
			
 
				+int btrfs_parse_options(struct btrfs_root *root, char *options,
			
 
				+			unsigned long new_flags)
			
 
				 {
			
 
				 	struct btrfs_fs_info *info = root->fs_info;
			
 
				 	substring_t args[MAX_OPT_ARGS];
			
@@ -393,8 +398,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 
				 	else if (cache_gen)
			
 
				 		btrfs_set_opt(info->mount_opt, SPACE_CACHE);
			
 
				 
			
 
				+	/*
			
 
				+	 * Even if the options are empty, we still need to do extra check
			
 
				+	 * against new flags
			
 
				+	 */
			
 
				 	if (!options)
			
 
				-		goto out;
			
 
				+		goto check;
			
 
				 
			
 
				 	/*
			
 
				 	 * strsep changes the string, duplicate it because parse_options
			
@@ -606,6 +615,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 
				 			btrfs_clear_and_info(root, NOTREELOG,
			
 
				 					     "enabling tree log");
			
 
				 			break;
			
 
				+		case Opt_norecovery:
			
 
				+		case Opt_nologreplay:
			
 
				+			btrfs_set_and_info(root, NOLOGREPLAY,
			
 
				+					   "disabling log replay at mount time");
			
 
				+			break;
			
 
				 		case Opt_flushoncommit:
			
 
				 			btrfs_set_and_info(root, FLUSHONCOMMIT,
			
 
				 					   "turning on flush-on-commit");
			
@@ -696,8 +710,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 
				 					     "disabling auto defrag");
			
 
				 			break;
			
 
				 		case Opt_recovery:
			
 
				-			btrfs_info(root->fs_info, "enabling auto recovery");
			
 
				-			btrfs_set_opt(info->mount_opt, RECOVERY);
			
 
				+			btrfs_warn(root->fs_info,
			
 
				+				   "'recovery' is deprecated, use 'usebackuproot' instead");
			
 
				+		case Opt_usebackuproot:
			
 
				+			btrfs_info(root->fs_info,
			
 
				+				   "trying to use backup root at mount time");
			
 
				+			btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
			
 
				 			break;
			
 
				 		case Opt_skip_balance:
			
 
				 			btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
			
@@ -792,6 +810,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 
				 			break;
			
 
				 		}
			
 
				 	}
			
 
				+check:
			
 
				+	/*
			
 
				+	 * Extra check for current option against current flag
			
 
				+	 */
			
 
				+	if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
			
 
				+		btrfs_err(root->fs_info,
			
 
				+			  "nologreplay must be used with ro mount option");
			
 
				+		ret = -EINVAL;
			
 
				+	}
			
 
				 out:
			
 
				 	if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
			
 
				 	    !btrfs_test_opt(root, FREE_SPACE_TREE) &&
			
@@ -1202,6 +1229,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 
				 		seq_puts(seq, ",ssd");
			
 
				 	if (btrfs_test_opt(root, NOTREELOG))
			
 
				 		seq_puts(seq, ",notreelog");
			
 
				+	if (btrfs_test_opt(root, NOLOGREPLAY))
			
 
				+		seq_puts(seq, ",nologreplay");
			
 
				 	if (btrfs_test_opt(root, FLUSHONCOMMIT))
			
 
				 		seq_puts(seq, ",flushoncommit");
			
 
				 	if (btrfs_test_opt(root, DISCARD))
			
@@ -1228,8 +1257,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 
				 		seq_puts(seq, ",inode_cache");
			
 
				 	if (btrfs_test_opt(root, SKIP_BALANCE))
			
 
				 		seq_puts(seq, ",skip_balance");
			
 
				-	if (btrfs_test_opt(root, RECOVERY))
			
 
				-		seq_puts(seq, ",recovery");
			
 
				 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
			
 
				 	if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
			
 
				 		seq_puts(seq, ",check_int_data");
			
@@ -1685,7 +1712,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	ret = btrfs_parse_options(root, data);
			
 
				+	ret = btrfs_parse_options(root, data, *flags);
			
 
				 	if (ret) {
			
 
				 		ret = -EINVAL;
			
 
				 		goto restore;
			
@@ -2163,6 +2190,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
				 			break;
			
 
				 		ret = !(fs_devices->num_devices == fs_devices->total_devices);
			
 
				 		break;
			
 
				+	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
			
 
				+		ret = btrfs_ioctl_get_supported_features((void __user*)arg);
			
 
				+		break;
			
 
				 	}
			
 
				 
			
 
				 	kfree(vol);
			
@@ -2261,7 +2291,7 @@ static void btrfs_interface_exit(void)
 
				 	misc_deregister(&btrfs_misc);
			
 
				 }
			
 
				 
			
 
				-static void btrfs_print_info(void)
			
 
				+static void btrfs_print_mod_info(void)
			
 
				 {
			
 
				 	printk(KERN_INFO "Btrfs loaded"
			
 
				 #ifdef CONFIG_BTRFS_DEBUG
			
@@ -2363,7 +2393,7 @@ static int __init init_btrfs_fs(void)
 
				 
			
 
				 	btrfs_init_lockdep();
			
 
				 
			
 
				-	btrfs_print_info();
			
 
				+	btrfs_print_mod_info();
			
 
				 
			
 
				 	err = btrfs_run_sanity_tests();
			
 
				 	if (err)
			
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -188,12 +188,6 @@ btrfs_alloc_dummy_block_group(unsigned long length)
 
				 		kfree(cache);
			
 
				 		return NULL;
			
 
				 	}
			
 
				-	cache->fs_info = btrfs_alloc_dummy_fs_info();
			
 
				-	if (!cache->fs_info) {
			
 
				-		kfree(cache->free_space_ctl);
			
 
				-		kfree(cache);
			
 
				-		return NULL;
			
 
				-	}
			
 
				 
			
 
				 	cache->key.objectid = 0;
			
 
				 	cache->key.offset = length;
			
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -485,6 +485,7 @@ static int run_test(test_func_t test_func, int bitmaps)
 
				 	cache->bitmap_low_thresh = 0;
			
 
				 	cache->bitmap_high_thresh = (u32)-1;
			
 
				 	cache->needs_free_space = 1;
			
 
				+	cache->fs_info = root->fs_info;
			
 
				 
			
 
				 	btrfs_init_dummy_trans(&trans);
			
 
				 
			
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -22,6 +22,7 @@
 
				 #include "../disk-io.h"
			
 
				 #include "../extent_io.h"
			
 
				 #include "../volumes.h"
			
 
				+#include "../compression.h"
			
 
				 
			
 
				 static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
			
 
				 			  u64 ram_bytes, u64 offset, u64 disk_bytenr,
			
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -637,6 +637,8 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
 
				 
			
 
				 	trans->block_rsv = &root->fs_info->trans_block_rsv;
			
 
				 	trans->bytes_reserved = num_bytes;
			
 
				+	trace_btrfs_space_reservation(root->fs_info, "transaction",
			
 
				+				      trans->transid, num_bytes, 1);
			
 
				 
			
 
				 	return trans;
			
 
				 }
			
@@ -1333,7 +1335,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
				 	struct dentry *dentry;
			
 
				 	struct extent_buffer *tmp;
			
 
				 	struct extent_buffer *old;
			
 
				-	struct timespec cur_time = CURRENT_TIME;
			
 
				+	struct timespec cur_time;
			
 
				 	int ret = 0;
			
 
				 	u64 to_reserve = 0;
			
 
				 	u64 index = 0;
			
@@ -1375,12 +1377,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
				 	rsv = trans->block_rsv;
			
 
				 	trans->block_rsv = &pending->block_rsv;
			
 
				 	trans->bytes_reserved = trans->block_rsv->reserved;
			
 
				-
			
 
				+	trace_btrfs_space_reservation(root->fs_info, "transaction",
			
 
				+				      trans->transid,
			
 
				+				      trans->bytes_reserved, 1);
			
 
				 	dentry = pending->dentry;
			
 
				 	parent_inode = pending->dir;
			
 
				 	parent_root = BTRFS_I(parent_inode)->root;
			
 
				 	record_root_in_trans(trans, parent_root);
			
 
				 
			
 
				+	cur_time = current_fs_time(parent_inode->i_sb);
			
 
				+
			
 
				 	/*
			
 
				 	 * insert the directory item
			
 
				 	 */
			
@@ -1523,7 +1529,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
				 
			
 
				 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
			
 
				 					 dentry->d_name.len * 2);
			
 
				-	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
			
 
				+	parent_inode->i_mtime = parent_inode->i_ctime =
			
 
				+		current_fs_time(parent_inode->i_sb);
			
 
				 	ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
			
 
				 	if (ret) {
			
 
				 		btrfs_abort_transaction(trans, root, ret);
			
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,6 +26,7 @@
 
				 #include "print-tree.h"
			
 
				 #include "backref.h"
			
 
				 #include "hash.h"
			
 
				+#include "compression.h"
			
 
				 
			
 
				 /* magic values for the inode_only field in btrfs_log_inode:
			
 
				  *
			
@@ -1045,7 +1046,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 
				 
			
 
				 		/*
			
 
				 		 * NOTE: we have searched root tree and checked the
			
 
				-		 * coresponding ref, it does not need to check again.
			
 
				+		 * corresponding ref, it does not need to check again.
			
 
				 		 */
			
 
				 		*search_done = 1;
			
 
				 	}
			
@@ -4500,7 +4501,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
				 
			
 
				 	mutex_lock(&BTRFS_I(inode)->log_mutex);
			
 
				 
			
 
				-	btrfs_get_logged_extents(inode, &logged_list, start, end);
			
 
				+	/*
			
 
				+	 * Collect ordered extents only if we are logging data. This is to
			
 
				+	 * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
			
 
				+	 * will process the ordered extents if they still exist at the time,
			
 
				+	 * because when we collect them we test and set for the flag
			
 
				+	 * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
			
 
				+	 * same ordered extents. The consequence for the LOG_INODE_ALL log mode
			
 
				+	 * not processing the ordered extents is that we end up logging the
			
 
				+	 * corresponding file extent items, based on the extent maps in the
			
 
				+	 * inode's extent_map_tree's modified_list, without logging the
			
 
				+	 * respective checksums (since they may still be only attached to the
			
 
				+	 * ordered extents and have not been inserted in the csum tree by
			
 
				+	 * btrfs_finish_ordered_io() yet).
			
 
				+	 */
			
 
				+	if (inode_only == LOG_INODE_ALL)
			
 
				+		btrfs_get_logged_extents(inode, &logged_list, start, end);
			
 
				 
			
 
				 	/*
			
 
				 	 * a brute force approach to making sure we get the most uptodate
			
@@ -4771,6 +4787,42 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
				 	return err;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Check if we must fallback to a transaction commit when logging an inode.
			
 
				+ * This must be called after logging the inode and is used only in the context
			
 
				+ * when fsyncing an inode requires the need to log some other inode - in which
			
 
				+ * case we can't lock the i_mutex of each other inode we need to log as that
			
 
				+ * can lead to deadlocks with concurrent fsync against other inodes (as we can
			
 
				+ * log inodes up or down in the hierarchy) or rename operations for example. So
			
 
				+ * we take the log_mutex of the inode after we have logged it and then check for
			
 
				+ * its last_unlink_trans value - this is safe because any task setting
			
 
				+ * last_unlink_trans must take the log_mutex and it must do this before it does
			
 
				+ * the actual unlink operation, so if we do this check before a concurrent task
			
 
				+ * sets last_unlink_trans it means we've logged a consistent version/state of
			
 
				+ * all the inode items, otherwise we are not sure and must do a transaction
			
 
				+ * commit (the concurrent task might have only updated last_unlink_trans before
			
 
				+ * we logged the inode or it might have also done the unlink).
			
 
				+ */
			
 
				+static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
			
 
				+					  struct inode *inode)
			
 
				+{
			
 
				+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
			
 
				+	bool ret = false;
			
 
				+
			
 
				+	mutex_lock(&BTRFS_I(inode)->log_mutex);
			
 
				+	if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
			
 
				+		/*
			
 
				+		 * Make sure any commits to the log are forced to be full
			
 
				+		 * commits.
			
 
				+		 */
			
 
				+		btrfs_set_log_full_commit(fs_info, trans);
			
 
				+		ret = true;
			
 
				+	}
			
 
				+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * follow the dentry parent pointers up the chain and see if any
			
 
				  * of the directories in it require a full commit before they can
			
@@ -4784,7 +4836,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 
				 					       u64 last_committed)
			
 
				 {
			
 
				 	int ret = 0;
			
 
				-	struct btrfs_root *root;
			
 
				 	struct dentry *old_parent = NULL;
			
 
				 	struct inode *orig_inode = inode;
			
 
				 
			
@@ -4816,14 +4867,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 
				 			BTRFS_I(inode)->logged_trans = trans->transid;
			
 
				 		smp_mb();
			
 
				 
			
 
				-		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
			
 
				-			root = BTRFS_I(inode)->root;
			
 
				-
			
 
				-			/*
			
 
				-			 * make sure any commits to the log are forced
			
 
				-			 * to be full commits
			
 
				-			 */
			
 
				-			btrfs_set_log_full_commit(root->fs_info, trans);
			
 
				+		if (btrfs_must_commit_transaction(trans, inode)) {
			
 
				 			ret = 1;
			
 
				 			break;
			
 
				 		}
			
@@ -4982,6 +5026,9 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 
				 			btrfs_release_path(path);
			
 
				 			ret = btrfs_log_inode(trans, root, di_inode,
			
 
				 					      log_mode, 0, LLONG_MAX, ctx);
			
 
				+			if (!ret &&
			
 
				+			    btrfs_must_commit_transaction(trans, di_inode))
			
 
				+				ret = 1;
			
 
				 			iput(di_inode);
			
 
				 			if (ret)
			
 
				 				goto next_dir_inode;
			
@@ -5096,6 +5143,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
 
				 
			
 
				 			ret = btrfs_log_inode(trans, root, dir_inode,
			
 
				 					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
			
 
				+			if (!ret &&
			
 
				+			    btrfs_must_commit_transaction(trans, dir_inode))
			
 
				+				ret = 1;
			
 
				 			iput(dir_inode);
			
 
				 			if (ret)
			
 
				 				goto out;
			
@@ -5447,6 +5497,9 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 
				  * They revolve around files there were unlinked from the directory, and
			
 
				  * this function updates the parent directory so that a full commit is
			
 
				  * properly done if it is fsync'd later after the unlinks are done.
			
 
				+ *
			
 
				+ * Must be called before the unlink operations (updates to the subvolume tree,
			
 
				+ * inodes, etc) are done.
			
 
				  */
			
 
				 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			
 
				 			     struct inode *dir, struct inode *inode,
			
@@ -5462,8 +5515,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 
				 	 * into the file.  When the file is logged we check it and
			
 
				 	 * don't log the parents if the file is fully on disk.
			
 
				 	 */
			
 
				-	if (S_ISREG(inode->i_mode))
			
 
				+	if (S_ISREG(inode->i_mode)) {
			
 
				+		mutex_lock(&BTRFS_I(inode)->log_mutex);
			
 
				 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
			
 
				+		mutex_unlock(&BTRFS_I(inode)->log_mutex);
			
 
				+	}
			
 
				 
			
 
				 	/*
			
 
				 	 * if this directory was already logged any new
			
@@ -5494,7 +5550,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 
				 	return;
			
 
				 
			
 
				 record:
			
 
				+	mutex_lock(&BTRFS_I(dir)->log_mutex);
			
 
				+	BTRFS_I(dir)->last_unlink_trans = trans->transid;
			
 
				+	mutex_unlock(&BTRFS_I(dir)->log_mutex);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Make sure that if someone attempts to fsync the parent directory of a deleted
			
 
				+ * snapshot, it ends up triggering a transaction commit. This is to guarantee
			
 
				+ * that after replaying the log tree of the parent directory's root we will not
			
 
				+ * see the snapshot anymore and at log replay time we will not see any log tree
			
 
				+ * corresponding to the deleted snapshot's root, which could lead to replaying
			
 
				+ * it after replaying the log tree of the parent directory (which would replay
			
 
				+ * the snapshot delete operation).
			
 
				+ *
			
 
				+ * Must be called before the actual snapshot destroy operation (updates to the
			
 
				+ * parent root and tree of tree roots, etc) is done.
			
 
				+ */
			
 
				+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
			
 
				+				   struct inode *dir)
			
 
				+{
			
 
				+	mutex_lock(&BTRFS_I(dir)->log_mutex);
			
 
				 	BTRFS_I(dir)->last_unlink_trans = trans->transid;
			
 
				+	mutex_unlock(&BTRFS_I(dir)->log_mutex);
			
 
				 }
			
 
				 
			
 
				 /*
			
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -79,6 +79,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root);
 
				 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			
 
				 			     struct inode *dir, struct inode *inode,
			
 
				 			     int for_rename);
			
 
				+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
			
 
				+				   struct inode *dir);
			
 
				 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
			
 
				 			struct inode *inode, struct inode *old_dir,
			
 
				 			struct dentry *parent);
			
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -138,7 +138,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
 
				 {
			
 
				 	struct btrfs_fs_devices *fs_devs;
			
 
				 
			
 
				-	fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
			
 
				+	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
			
 
				 	if (!fs_devs)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 
			
@@ -220,7 +220,7 @@ static struct btrfs_device *__alloc_device(void)
 
				 {
			
 
				 	struct btrfs_device *dev;
			
 
				 
			
 
				-	dev = kzalloc(sizeof(*dev), GFP_NOFS);
			
 
				+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
			
 
				 	if (!dev)
			
 
				 		return ERR_PTR(-ENOMEM);
			
 
				 
			
@@ -733,7 +733,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 
				 		 * uuid mutex so nothing we touch in here is going to disappear.
			
 
				 		 */
			
 
				 		if (orig_dev->name) {
			
 
				-			name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
			
 
				+			name = rcu_string_strdup(orig_dev->name->str,
			
 
				+					GFP_KERNEL);
			
 
				 			if (!name) {
			
 
				 				kfree(device);
			
 
				 				goto error;
			
@@ -1714,12 +1715,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
				 	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
			
 
				 
			
 
				 	num_devices = root->fs_info->fs_devices->num_devices;
			
 
				-	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
			
 
				 	if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
			
 
				 		WARN_ON(num_devices < 1);
			
 
				 		num_devices--;
			
 
				 	}
			
 
				-	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
			
 
				 
			
 
				 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
			
 
				 		ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
			
@@ -2287,7 +2288,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
				 		goto error;
			
 
				 	}
			
 
				 
			
 
				-	name = rcu_string_strdup(device_path, GFP_NOFS);
			
 
				+	name = rcu_string_strdup(device_path, GFP_KERNEL);
			
 
				 	if (!name) {
			
 
				 		kfree(device);
			
 
				 		ret = -ENOMEM;
			
@@ -2748,7 +2749,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 
				 	    em->start + em->len < chunk_offset) {
			
 
				 		/*
			
 
				 		 * This is a logic error, but we don't want to just rely on the
			
 
				-		 * user having built with ASSERT enabled, so if ASSERT doens't
			
 
				+		 * user having built with ASSERT enabled, so if ASSERT doesn't
			
 
				 		 * do anything we still error out.
			
 
				 		 */
			
 
				 		ASSERT(0);
			
@@ -2966,7 +2967,7 @@ static int insert_balance_item(struct btrfs_root *root,
 
				 	}
			
 
				 
			
 
				 	key.objectid = BTRFS_BALANCE_OBJECTID;
			
 
				-	key.type = BTRFS_BALANCE_ITEM_KEY;
			
 
				+	key.type = BTRFS_TEMPORARY_ITEM_KEY;
			
 
				 	key.offset = 0;
			
 
				 
			
 
				 	ret = btrfs_insert_empty_item(trans, root, path, &key,
			
@@ -3015,7 +3016,7 @@ static int del_balance_item(struct btrfs_root *root)
 
				 	}
			
 
				 
			
 
				 	key.objectid = BTRFS_BALANCE_OBJECTID;
			
 
				-	key.type = BTRFS_BALANCE_ITEM_KEY;
			
 
				+	key.type = BTRFS_TEMPORARY_ITEM_KEY;
			
 
				 	key.offset = 0;
			
 
				 
			
 
				 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
			
@@ -3686,12 +3687,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
				 	}
			
 
				 
			
 
				 	num_devices = fs_info->fs_devices->num_devices;
			
 
				-	btrfs_dev_replace_lock(&fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
			
 
				 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
			
 
				 		BUG_ON(num_devices < 1);
			
 
				 		num_devices--;
			
 
				 	}
			
 
				-	btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
			
 
				 	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
			
 
				 	if (num_devices == 1)
			
 
				 		allowed |= BTRFS_BLOCK_GROUP_DUP;
			
@@ -3867,7 +3868,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
 
				 		return -ENOMEM;
			
 
				 
			
 
				 	key.objectid = BTRFS_BALANCE_OBJECTID;
			
 
				-	key.type = BTRFS_BALANCE_ITEM_KEY;
			
 
				+	key.type = BTRFS_TEMPORARY_ITEM_KEY;
			
 
				 	key.offset = 0;
			
 
				 
			
 
				 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
			
@@ -4118,7 +4119,7 @@ static int btrfs_uuid_scan_kthread(void *data)
 
				  * Callback for btrfs_uuid_tree_iterate().
			
 
				  * returns:
			
 
				  * 0	check succeeded, the entry is not outdated.
			
 
				- * < 0	if an error occured.
			
 
				+ * < 0	if an error occurred.
			
 
				  * > 0	if the check failed, which means the caller shall remove the entry.
			
 
				  */
			
 
				 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
			
@@ -5062,10 +5063,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 
				 		ret = 1;
			
 
				 	free_extent_map(em);
			
 
				 
			
 
				-	btrfs_dev_replace_lock(&fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
			
 
				 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
			
 
				 		ret++;
			
 
				-	btrfs_dev_replace_unlock(&fs_info->dev_replace);
			
 
				+	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
			
 
				 
			
 
				 	return ret;
			
 
				 }
			
@@ -5325,10 +5326,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 
				 	if (!bbio_ret)
			
 
				 		goto out;
			
 
				 
			
 
				-	btrfs_dev_replace_lock(dev_replace);
			
 
				+	btrfs_dev_replace_lock(dev_replace, 0);
			
 
				 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
			
 
				 	if (!dev_replace_is_ongoing)
			
 
				-		btrfs_dev_replace_unlock(dev_replace);
			
 
				+		btrfs_dev_replace_unlock(dev_replace, 0);
			
 
				+	else
			
 
				+		btrfs_dev_replace_set_lock_blocking(dev_replace);
			
 
				 
			
 
				 	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
			
 
				 	    !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
			
@@ -5751,8 +5754,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 
				 		bbio->mirror_num = map->num_stripes + 1;
			
 
				 	}
			
 
				 out:
			
 
				-	if (dev_replace_is_ongoing)
			
 
				-		btrfs_dev_replace_unlock(dev_replace);
			
 
				+	if (dev_replace_is_ongoing) {
			
 
				+		btrfs_dev_replace_clear_lock_blocking(dev_replace);
			
 
				+		btrfs_dev_replace_unlock(dev_replace, 0);
			
 
				+	}
			
 
				 	free_extent_map(em);
			
 
				 	return ret;
			
 
				 }
			
@@ -6705,8 +6710,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
 
				 		int item_size;
			
 
				 		struct btrfs_dev_stats_item *ptr;
			
 
				 
			
 
				-		key.objectid = 0;
			
 
				-		key.type = BTRFS_DEV_STATS_KEY;
			
 
				+		key.objectid = BTRFS_DEV_STATS_OBJECTID;
			
 
				+		key.type = BTRFS_PERSISTENT_ITEM_KEY;
			
 
				 		key.offset = device->devid;
			
 
				 		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
			
 
				 		if (ret) {
			
@@ -6753,8 +6758,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 
				 	int ret;
			
 
				 	int i;
			
 
				 
			
 
				-	key.objectid = 0;
			
 
				-	key.type = BTRFS_DEV_STATS_KEY;
			
 
				+	key.objectid = BTRFS_DEV_STATS_OBJECTID;
			
 
				+	key.type = BTRFS_PERSISTENT_ITEM_KEY;
			
 
				 	key.offset = device->devid;
			
 
				 
			
 
				 	path = btrfs_alloc_path();
			
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -249,7 +249,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 
				 		goto out;
			
 
				 
			
 
				 	inode_inc_iversion(inode);
			
 
				-	inode->i_ctime = CURRENT_TIME;
			
 
				+	inode->i_ctime = current_fs_time(inode->i_sb);
			
 
				 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
			
 
				 	ret = btrfs_update_inode(trans, root, inode);
			
 
				 	BUG_ON(ret);
			
@@ -260,16 +260,12 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 
				 
			
 
				 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
			
 
				 {
			
 
				-	struct btrfs_key key, found_key;
			
 
				+	struct btrfs_key key;
			
 
				 	struct inode *inode = d_inode(dentry);
			
 
				 	struct btrfs_root *root = BTRFS_I(inode)->root;
			
 
				 	struct btrfs_path *path;
			
 
				-	struct extent_buffer *leaf;
			
 
				-	struct btrfs_dir_item *di;
			
 
				-	int ret = 0, slot;
			
 
				+	int ret = 0;
			
 
				 	size_t total_size = 0, size_left = size;
			
 
				-	unsigned long name_ptr;
			
 
				-	size_t name_len;
			
 
				 
			
 
				 	/*
			
 
				 	 * ok we want all objects associated with this id.
			
@@ -291,6 +287,13 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 
				 		goto err;
			
 
				 
			
 
				 	while (1) {
			
 
				+		struct extent_buffer *leaf;
			
 
				+		int slot;
			
 
				+		struct btrfs_dir_item *di;
			
 
				+		struct btrfs_key found_key;
			
 
				+		u32 item_size;
			
 
				+		u32 cur;
			
 
				+
			
 
				 		leaf = path->nodes[0];
			
 
				 		slot = path->slots[0];
			
 
				 
			
@@ -316,31 +319,45 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 
				 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
			
 
				 			break;
			
 
				 		if (found_key.type < BTRFS_XATTR_ITEM_KEY)
			
 
				-			goto next;
			
 
				+			goto next_item;
			
 
				 
			
 
				 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
			
 
				-		if (verify_dir_item(root, leaf, di))
			
 
				-			goto next;
			
 
				-
			
 
				-		name_len = btrfs_dir_name_len(leaf, di);
			
 
				-		total_size += name_len + 1;
			
 
				+		item_size = btrfs_item_size_nr(leaf, slot);
			
 
				+		cur = 0;
			
 
				+		while (cur < item_size) {
			
 
				+			u16 name_len = btrfs_dir_name_len(leaf, di);
			
 
				+			u16 data_len = btrfs_dir_data_len(leaf, di);
			
 
				+			u32 this_len = sizeof(*di) + name_len + data_len;
			
 
				+			unsigned long name_ptr = (unsigned long)(di + 1);
			
 
				+
			
 
				+			if (verify_dir_item(root, leaf, di)) {
			
 
				+				ret = -EIO;
			
 
				+				goto err;
			
 
				+			}
			
 
				 
			
 
				-		/* we are just looking for how big our buffer needs to be */
			
 
				-		if (!size)
			
 
				-			goto next;
			
 
				+			total_size += name_len + 1;
			
 
				+			/*
			
 
				+			 * We are just looking for how big our buffer needs to
			
 
				+			 * be.
			
 
				+			 */
			
 
				+			if (!size)
			
 
				+				goto next;
			
 
				 
			
 
				-		if (!buffer || (name_len + 1) > size_left) {
			
 
				-			ret = -ERANGE;
			
 
				-			goto err;
			
 
				-		}
			
 
				+			if (!buffer || (name_len + 1) > size_left) {
			
 
				+				ret = -ERANGE;
			
 
				+				goto err;
			
 
				+			}
			
 
				 
			
 
				-		name_ptr = (unsigned long)(di + 1);
			
 
				-		read_extent_buffer(leaf, buffer, name_ptr, name_len);
			
 
				-		buffer[name_len] = '\0';
			
 
				+			read_extent_buffer(leaf, buffer, name_ptr, name_len);
			
 
				+			buffer[name_len] = '\0';
			
 
				 
			
 
				-		size_left -= name_len + 1;
			
 
				-		buffer += name_len + 1;
			
 
				+			size_left -= name_len + 1;
			
 
				+			buffer += name_len + 1;
			
 
				 next:
			
 
				+			cur += this_len;
			
 
				+			di = (struct btrfs_dir_item *)((char *)di + this_len);
			
 
				+		}
			
 
				+next_item:
			
 
				 		path->slots[0]++;
			
 
				 	}
			
 
				 	ret = total_size;