Merge tag 'dm-4.2-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - DM core cleanups:

     * blk-mq request-based DM no longer uses any mempools now that
       partial completions are no longer handled as part of cloned
       requests

 - DM raid cleanups and support for MD raid0

 - DM cache core advances and a new stochastic-multi-queue (smq) cache
   replacement policy

     * smq is the new default dm-cache policy

 - DM thinp cleanups and much more efficient large discard support

 - DM statistics support for request-based DM and nanosecond resolution
   timestamps

 - Fixes to DM stripe, DM log-writes, DM raid1 and DM crypt

* tag 'dm-4.2-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (39 commits)
  dm stats: add support for request-based DM devices
  dm stats: collect and report histogram of IO latencies
  dm stats: support precise timestamps
  dm stats: fix divide by zero if 'number_of_areas' arg is zero
  dm cache: switch the "default" cache replacement policy from mq to smq
  dm space map metadata: fix occasional leak of a metadata block on resize
  dm thin metadata: fix a race when entering fail mode
  dm thin: fail messages with EOPNOTSUPP when pool cannot handle messages
  dm thin: range discard support
  dm thin metadata: add dm_thin_remove_range()
  dm thin metadata: add dm_thin_find_mapped_range()
  dm btree: add dm_btree_remove_leaves()
  dm stats: Use kvfree() in dm_kvfree()
  dm cache: age and write back cache entries even without active IO
  dm cache: prefix all DMERR and DMINFO messages with cache device name
  dm cache: add fail io mode and needs_check flag
  dm cache: wake the worker thread every time we free a migration object
  dm cache: add stochastic-multi-queue (smq) policy
  dm cache: boost promotion of blocks that will be overwritten
  dm cache: defer whole cells
  ...
Linus Torvalds, 10 years ago
commit 6597ac8a51

+ 64 - 3
Documentation/device-mapper/cache-policies.txt

@@ -25,10 +25,10 @@ trying to see when the io scheduler has let the ios run.
 Overview of supplied cache replacement policies
 ===============================================
 
-multiqueue
-----------
+multiqueue (mq)
+---------------
 
-This policy is the default.
+This policy has been deprecated in favor of the smq policy (see below).
 
 The multiqueue policy has three sets of 16 queues: one set for entries
 waiting for the cache and another two for those in the cache (a set for
@@ -73,6 +73,67 @@ If you're trying to quickly warm a new cache device you may wish to
 reduce these to encourage promotion.  Remember to switch them back to
 their defaults after the cache fills though.
 
+Stochastic multiqueue (smq)
+---------------------------
+
+This policy is the default.
+
+The stochastic multi-queue (smq) policy addresses some of the problems
+with the multiqueue (mq) policy.
+
+The smq policy (vs mq) offers the promise of less memory utilization,
+improved performance and increased adaptability in the face of changing
+workloads.  SMQ also does not have any cumbersome tuning knobs.
+
+Users may switch from "mq" to "smq" simply by appropriately reloading a
+DM table that is using the cache target.  Doing so will cause all of the
+mq policy's hints to be dropped.  Also, performance of the cache may
+degrade slightly until smq recalculates the origin device's hotspots
+that should be cached.
+
+Memory usage:
+The mq policy uses a lot of memory; 88 bytes per cache block on a 64
+bit machine.
+
+SMQ uses 28-bit indexes to implement its data structures rather than
+pointers.  It avoids storing an explicit hit count for each block.  It
+has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of
+the entries (each hotspot block covers a larger area than a single
+cache block).
+
+All this means smq uses ~25 bytes per cache block.  Still a lot of
+memory, but a substantial improvement nonetheless.
+
+Level balancing:
+MQ places entries in different levels of the multiqueue structures
+based on their hit count (~ln(hit count)).  This means the bottom
+levels generally have the most entries, and the top ones have very
+few.  Having unbalanced levels like this reduces the efficacy of the
+multiqueue.
+
+SMQ does not maintain a hit count; instead it swaps hit entries with
+the least recently used entry from the level above.  The overall
+ordering is a side effect of this stochastic process.  With this
+scheme we can decide how many entries occupy each multiqueue level,
+resulting in better promotion/demotion decisions.
+
+Adaptability:
+The MQ policy maintains a hit count for each cache block.  For a
+different block to get promoted to the cache its hit count has to
+exceed the lowest currently in the cache.  This means it can take a
+long time for the cache to adapt between varying IO patterns.
+Periodically degrading the hit counts could help with this, but I
+haven't found a nice general solution.
+
+SMQ doesn't maintain hit counts, so a lot of this problem just goes
+away.  In addition it tracks performance of the hotspot queue, which
+is used to decide which blocks to promote.  If the hotspot queue is
+performing badly then it starts moving entries more quickly between
+levels.  This lets it adapt to new IO patterns very quickly.
+
+Performance:
+Testing SMQ shows substantially better performance than MQ.
+
 cleaner
 -------
 

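The memory-usage and level-balancing notes above map directly onto the data structures added later in this diff (drivers/md/dm-cache-policy-smq.c).  As a rough, stand-alone userspace sketch of the 28-bit-index layout the text describes, the struct below mirrors the kernel's struct entry; main() and the size figures in its comment are illustrative assumptions, not kernel code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Stand-alone sketch of the entry layout described above.  Links between
 * entries are 28-bit indexes into one preallocated array rather than
 * 64-bit pointers, and there is no per-entry hit count.  The bitfields
 * mirror the kernel's struct entry in dm-cache-policy-smq.c; everything
 * else here is illustrative.
 */
struct entry {
	unsigned hash_next:28;	/* next entry in the hash bucket */
	unsigned prev:28;	/* previous entry in its level's list */
	unsigned next:28;	/* next entry in its level's list */
	unsigned level:7;	/* multiqueue level the entry sits in */
	bool dirty:1;
	bool allocated:1;
	bool sentinel:1;

	uint64_t oblock;	/* origin block (dm_oblock_t in the kernel) */
};

int main(void)
{
	/*
	 * On a typical 64-bit build this prints 24: three 28-bit links plus
	 * the flag bits pack into four 32-bit words, followed by the 8-byte
	 * origin block.  That is in the region of the ~25 bytes per cache
	 * block quoted above, versus 88 bytes per block for mq.
	 */
	printf("bytes per smq entry: %zu\n", sizeof(struct entry));
	return 0;
}

The swap described under "Level balancing" is what q_requeue_before() in dm-cache-policy-smq.c (further down in this diff) implements: it pulls the oldest non-sentinel entry down from the destination level before pushing the hit entry up into it.
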
+ 7 - 2
Documentation/device-mapper/cache.txt

@@ -221,6 +221,7 @@ Status
 <#read hits> <#read misses> <#write hits> <#write misses>
 <#demotions> <#promotions> <#dirty> <#features> <features>*
 <#core args> <core args>* <policy name> <#policy args> <policy args>*
+<cache metadata mode>
 
 metadata block size	 : Fixed block size for each metadata block in
 			     sectors
@@ -251,8 +252,12 @@ core args		 : Key/value pairs for tuning the core
 			     e.g. migration_threshold
 policy name		 : Name of the policy
 #policy args		 : Number of policy arguments to follow (must be even)
-policy args		 : Key/value pairs
-			     e.g. sequential_threshold
+policy args		 : Key/value pairs e.g. sequential_threshold
+cache metadata mode      : ro if read-only, rw if read-write
+	In serious cases where even a read-only mode is deemed unsafe,
+	no further I/O will be permitted and the status will just
+	contain the string 'Fail'.  The userspace recovery tools
+	should then be used.
 
 Messages
 --------

+ 2 - 0
Documentation/device-mapper/dm-raid.txt

@@ -224,3 +224,5 @@ Version History
 	New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt.
 1.5.1   Add ability to restore transiently failed devices on resume.
 1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
+1.6.0   Add discard support (and devices_handle_discard_safely module param).
+1.7.0   Add support for MD RAID0 mappings.

+ 37 - 4
Documentation/device-mapper/statistics.txt

@@ -13,9 +13,14 @@ the range specified.
 The I/O statistics counters for each step-sized area of a region are
 in the same format as /sys/block/*/stat or /proc/diskstats (see:
 Documentation/iostats.txt).  But two extra counters (12 and 13) are
-provided: total time spent reading and writing in milliseconds.	 All
-these counters may be accessed by sending the @stats_print message to
-the appropriate DM device via dmsetup.
+provided: total time spent reading and writing.  When the histogram
+argument is used, a 14th counter is reported that represents the
+histogram of latencies.  All these counters may be accessed by sending
+the @stats_print message to the appropriate DM device via dmsetup.
+
+The reported times are in milliseconds and the granularity depends on
+the kernel ticks.  When the option precise_timestamps is used, the
+reported times are in nanoseconds.
 
 Each region has a corresponding unique identifier, which we call a
 region_id, that is assigned when the region is created.	 The region_id
@@ -33,7 +38,9 @@ memory is used by reading
 Messages
 ========
 
-    @stats_create <range> <step> [<program_id> [<aux_data>]]
+    @stats_create <range> <step>
+		[<number_of_optional_arguments> <optional_arguments>...]
+		[<program_id> [<aux_data>]]
 
 	Create a new region and return the region_id.
 
@@ -48,6 +55,29 @@ Messages
 	  "/<number_of_areas>" - the range is subdivided into the specified
 				 number of areas.
 
+	<number_of_optional_arguments>
+	  The number of optional arguments
+
+	<optional_arguments>
+	  The following optional arguments are supported
+	  precise_timestamps - use precise timer with nanosecond resolution
+		instead of the "jiffies" variable.  When this argument is
+		used, the resulting times are in nanoseconds instead of
+		milliseconds.  Precise timestamps are a little bit slower
+		to obtain than jiffies-based timestamps.
+	  histogram:n1,n2,n3,n4,... - collect histogram of latencies.  The
+		numbers n1, n2, etc are times that represent the boundaries
+		of the histogram.  If precise_timestamps is not used, the
+		times are in milliseconds, otherwise they are in
+		nanoseconds.  For each range, the kernel will report the
+		number of requests that completed within this range. For
+		example, if we use "histogram:10,20,30", the kernel will
+		report four numbers a:b:c:d. a is the number of requests
+		that took 0-10 ms to complete, b is the number of requests
+		that took 10-20 ms to complete, c is the number of requests
+		that took 20-30 ms to complete and d is the number of
+		requests that took more than 30 ms to complete.
+
 	<program_id>
 	  An optional parameter.  A name that uniquely identifies
 	  the userspace owner of the range.  This groups ranges together
@@ -55,6 +85,9 @@ Messages
 	  created and ignore those created by others.
 	  The kernel returns this string back in the output of
 	  @stats_list message, but it doesn't use it for anything else.
+	  If we omit the number of optional arguments, program id must not
+	  be a number, otherwise it would be interpreted as the number of
+	  optional arguments.
 
 	<aux_data>
 	  An optional parameter.  A word that provides auxiliary data
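
To make the histogram semantics above concrete, here is a small stand-alone C sketch of the bucketing rule the text describes (k boundaries give k+1 counters).  The function name and the treatment of a latency that lands exactly on a boundary are assumptions for illustration, not part of the dm-statistics interface.

#include <stdio.h>

/*
 * Illustrative sketch of the bucketing rule described above: with
 * "histogram:10,20,30" a completion time t is counted in
 *   a (t < 10), b (10 <= t < 20), c (20 <= t < 30) or d (t >= 30).
 * How the kernel treats a latency exactly on a boundary is not spelled
 * out above, so the half-open intervals here are an assumption.
 */
static unsigned histogram_bucket(const unsigned *boundaries,
				 unsigned nr_boundaries, unsigned t)
{
	unsigned i;

	for (i = 0; i < nr_boundaries; i++)
		if (t < boundaries[i])
			return i;

	return nr_boundaries;	/* the final "took longer than" bucket */
}

int main(void)
{
	const unsigned boundaries[] = {10, 20, 30};	/* histogram:10,20,30 */
	const unsigned latencies_ms[] = {3, 12, 25, 31, 9};
	unsigned counts[4] = {0};
	unsigned i;

	for (i = 0; i < sizeof(latencies_ms) / sizeof(latencies_ms[0]); i++)
		counts[histogram_bucket(boundaries, 3, latencies_ms[i])]++;

	/* prints "a:b:c:d = 2:1:1:1" for the sample latencies above */
	printf("a:b:c:d = %u:%u:%u:%u\n",
	       counts[0], counts[1], counts[2], counts[3]);
	return 0;
}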

+ 12 - 0
drivers/md/Kconfig

@@ -304,6 +304,18 @@ config DM_CACHE_MQ
          This is meant to be a general purpose policy.  It prioritises
          reads over writes.
 
+config DM_CACHE_SMQ
+       tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
+       depends on DM_CACHE
+       default y
+       ---help---
+         A cache policy that uses a multiqueue ordered by recent hits
+         to select which blocks should be promoted and demoted.
+         This is meant to be a general purpose policy.  It prioritises
+         reads over writes.  This SMQ policy (vs MQ) offers the promise
+         of less memory utilization, improved performance and increased
+         adaptability in the face of changing workloads.
+
 config DM_CACHE_CLEANER
        tristate "Cleaner Cache Policy (EXPERIMENTAL)"
        depends on DM_CACHE

+ 2 - 0
drivers/md/Makefile

@@ -13,6 +13,7 @@ dm-log-userspace-y \
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
 dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
 dm-cache-mq-y   += dm-cache-policy-mq.o
+dm-cache-smq-y   += dm-cache-policy-smq.o
 dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 md-mod-y	+= md.o bitmap.o
@@ -54,6 +55,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
 obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o

+ 26 - 0
drivers/md/dm-bio-prison.c

@@ -255,6 +255,32 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
 }
 EXPORT_SYMBOL_GPL(dm_cell_visit_release);
 
+static int __promote_or_release(struct dm_bio_prison *prison,
+				struct dm_bio_prison_cell *cell)
+{
+	if (bio_list_empty(&cell->bios)) {
+		rb_erase(&cell->node, &prison->cells);
+		return 1;
+	}
+
+	cell->holder = bio_list_pop(&cell->bios);
+	return 0;
+}
+
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell)
+{
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&prison->lock, flags);
+	r = __promote_or_release(prison, cell);
+	spin_unlock_irqrestore(&prison->lock, flags);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_promote_or_release);
+
 /*----------------------------------------------------------------*/
 
 #define DEFERRED_SET_SIZE 64

+ 13 - 0
drivers/md/dm-bio-prison.h

@@ -101,6 +101,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
 			   void (*visit_fn)(void *, struct dm_bio_prison_cell *),
 			   void *context, struct dm_bio_prison_cell *cell);
 
+/*
+ * Rather than always releasing the prisoners in a cell, the client may
+ * want to promote one of them to be the new holder.  There is a race here
+ * though between releasing an empty cell and other threads adding new
+ * inmates.  So this function makes the decision with its lock held.
+ *
+ * This function can have two outcomes:
+ * i) An inmate is promoted to be the holder of the cell (return value of 0).
+ * ii) The cell has no inmate for promotion and is released (return value of 1).
+ */
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell);
+
 /*----------------------------------------------------------------*/
 
 /*

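The comment above lists the two outcomes of dm_cell_promote_or_release().  The toy userspace model below acts out the same decision; its structures are not the kernel's dm_bio_prison types (those live in dm-bio-prison.c), which make this choice under prison->lock.

#include <stdio.h>

/*
 * Toy model of the promote-or-release rule documented above.  These are
 * not the kernel's dm_bio_prison structures; a "waiter" stands in for a
 * queued bio and the return values follow the convention in the comment:
 * 0 = an inmate was promoted to holder, 1 = the cell is released.
 */
struct waiter {
	int id;			/* stands in for a struct bio */
	struct waiter *next;
};

struct cell {
	struct waiter *holder;	/* the bio currently "owning" the cell */
	struct waiter *waiters;	/* queued bios (the inmates) */
};

static int promote_or_release(struct cell *cell)
{
	if (!cell->waiters)
		return 1;	/* empty: caller should release the cell */

	cell->holder = cell->waiters;	/* promote the oldest waiter */
	cell->waiters = cell->waiters->next;
	cell->holder->next = NULL;
	return 0;
}

int main(void)
{
	struct waiter w2 = { 2, NULL };
	struct waiter w1 = { 1, NULL };
	struct cell cell = { .holder = &w1, .waiters = &w2 };

	/* the holder has finished with the cell; decide what happens next */
	if (promote_or_release(&cell))
		printf("cell empty: release it\n");
	else
		printf("waiter %d promoted to holder\n", cell.holder->id);

	/* no waiters remain, so this time the cell would be released */
	if (promote_or_release(&cell))
		printf("cell empty: release it\n");
	return 0;
}
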
+ 114 - 19
drivers/md/dm-cache-metadata.c

@@ -39,6 +39,8 @@
 enum superblock_flag_bits {
 	/* for spotting crashes that would invalidate the dirty bitset */
 	CLEAN_SHUTDOWN,
+	/* metadata must be checked using the tools */
+	NEEDS_CHECK,
 };
 
 /*
@@ -107,6 +109,7 @@ struct dm_cache_metadata {
 	struct dm_disk_bitset discard_info;
 
 	struct rw_semaphore root_lock;
+	unsigned long flags;
 	dm_block_t root;
 	dm_block_t hint_root;
 	dm_block_t discard_root;
@@ -129,6 +132,14 @@ struct dm_cache_metadata {
 	 * buffer before the superblock is locked and updated.
 	 */
 	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+	/*
+	 * Set if a transaction has to be aborted but the attempt to roll
+	 * back to the previous (good) transaction failed.  The only
+	 * metadata operation permissible in this state is the closing of
+	 * the device.
+	 */
+	bool fail_io:1;
 };
 
 /*-------------------------------------------------------------------
@@ -527,6 +538,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
 static void read_superblock_fields(struct dm_cache_metadata *cmd,
 				   struct cache_disk_superblock *disk_super)
 {
+	cmd->flags = le32_to_cpu(disk_super->flags);
 	cmd->root = le64_to_cpu(disk_super->mapping_root);
 	cmd->hint_root = le64_to_cpu(disk_super->hint_root);
 	cmd->discard_root = le64_to_cpu(disk_super->discard_root);
@@ -625,6 +637,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
 	if (mutator)
 		update_flags(disk_super, mutator);
 
+	disk_super->flags = cpu_to_le32(cmd->flags);
 	disk_super->mapping_root = cpu_to_le64(cmd->root);
 	disk_super->hint_root = cpu_to_le64(cmd->hint_root);
 	disk_super->discard_root = cpu_to_le64(cmd->discard_root);
@@ -693,6 +706,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
 	cmd->cache_blocks = 0;
 	cmd->policy_hint_size = policy_hint_size;
 	cmd->changed = true;
+	cmd->fail_io = false;
 
 	r = __create_persistent_data_objects(cmd, may_format_device);
 	if (r) {
@@ -796,7 +810,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
 		list_del(&cmd->list);
 		mutex_unlock(&table_lock);
 
-		__destroy_persistent_data_objects(cmd);
+		if (!cmd->fail_io)
+			__destroy_persistent_data_objects(cmd);
 		kfree(cmd);
 	}
 }
@@ -848,13 +863,26 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
 	return 0;
 }
 
+#define WRITE_LOCK(cmd) \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+		return -EINVAL; \
+	down_write(&cmd->root_lock)
+
+#define WRITE_LOCK_VOID(cmd) \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+		return; \
+	down_write(&cmd->root_lock)
+
+#define WRITE_UNLOCK(cmd) \
+	up_write(&cmd->root_lock)
+
 int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
 {
 	int r;
 	bool clean;
 	__le64 null_mapping = pack_value(0, 0);
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	__dm_bless_for_disk(&null_mapping);
 
 	if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
@@ -880,7 +908,7 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
 	cmd->changed = true;
 
 out:
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -891,7 +919,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = dm_bitset_resize(&cmd->discard_info,
 			     cmd->discard_root,
 			     from_dblock(cmd->discard_nr_blocks),
@@ -903,7 +931,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
 	}
 
 	cmd->changed = true;
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -946,9 +974,9 @@ int dm_cache_set_discard(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __discard(cmd, dblock, discard);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1020,9 +1048,9 @@ int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __remove(cmd, cblock);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1048,9 +1076,9 @@ int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __insert(cmd, cblock, oblock);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1234,9 +1262,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __dirty(cmd, cblock, dirty);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1252,9 +1280,9 @@ void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
 void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats)
 {
-	down_write(&cmd->root_lock);
+	WRITE_LOCK_VOID(cmd);
 	cmd->stats = *stats;
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 }
 
 int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
@@ -1263,7 +1291,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
 	flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
 				 clear_clean_shutdown);
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __commit_transaction(cmd, mutator);
 	if (r)
 		goto out;
@@ -1271,7 +1299,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
 	r = __begin_transaction(cmd);
 
 out:
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 	return r;
 }
 
@@ -1376,9 +1404,9 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = write_hints(cmd, policy);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1387,3 +1415,70 @@ int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
 {
 	return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
 }
+
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd)
+{
+	WRITE_LOCK_VOID(cmd);
+	dm_bm_set_read_only(cmd->bm);
+	WRITE_UNLOCK(cmd);
+}
+
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd)
+{
+	WRITE_LOCK_VOID(cmd);
+	dm_bm_set_read_write(cmd->bm);
+	WRITE_UNLOCK(cmd);
+}
+
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct cache_disk_superblock *disk_super;
+
+	/*
+	 * We ignore fail_io for this function.
+	 */
+	down_write(&cmd->root_lock);
+	set_bit(NEEDS_CHECK, &cmd->flags);
+
+	r = superblock_lock(cmd, &sblock);
+	if (r) {
+		DMERR("couldn't read superblock");
+		goto out;
+	}
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = cpu_to_le32(cmd->flags);
+
+	dm_bm_unlock(sblock);
+
+out:
+	up_write(&cmd->root_lock);
+	return r;
+}
+
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd)
+{
+	bool needs_check;
+
+	down_read(&cmd->root_lock);
+	needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags);
+	up_read(&cmd->root_lock);
+
+	return needs_check;
+}
+
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
+{
+	int r;
+
+	WRITE_LOCK(cmd);
+	__destroy_persistent_data_objects(cmd);
+	r = __create_persistent_data_objects(cmd, false);
+	if (r)
+		cmd->fail_io = true;
+	WRITE_UNLOCK(cmd);
+
+	return r;
+}

+ 10 - 0
drivers/md/dm-cache-metadata.h

@@ -102,6 +102,10 @@ struct dm_cache_statistics {
 
 void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats);
+
+/*
+ * 'void' because it's no big deal if it fails.
+ */
 void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats);
 
@@ -133,6 +137,12 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
  */
 int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
 
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd);
+
 /*----------------------------------------------------------------*/
 
 #endif /* DM_CACHE_METADATA_H */

+ 4 - 2
drivers/md/dm-cache-policy-cleaner.c

@@ -171,7 +171,8 @@ static void remove_cache_hash_entry(struct wb_cache_entry *e)
 /* Public interface (see dm-cache-policy.h */
 static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
 		  bool can_block, bool can_migrate, bool discarded_oblock,
-		  struct bio *bio, struct policy_result *result)
+		  struct bio *bio, struct policy_locker *locker,
+		  struct policy_result *result)
 {
 	struct policy *p = to_policy(pe);
 	struct wb_cache_entry *e;
@@ -358,7 +359,8 @@ static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
 
 static int wb_writeback_work(struct dm_cache_policy *pe,
 			     dm_oblock_t *oblock,
-			     dm_cblock_t *cblock)
+			     dm_cblock_t *cblock,
+			     bool critical_only)
 {
 	int r = -ENOENT;
 	struct policy *p = to_policy(pe);

+ 42 - 10
drivers/md/dm-cache-policy-internal.h

@@ -7,6 +7,7 @@
 #ifndef DM_CACHE_POLICY_INTERNAL_H
 #define DM_CACHE_POLICY_INTERNAL_H
 
+#include <linux/vmalloc.h>
 #include "dm-cache-policy.h"
 
 /*----------------------------------------------------------------*/
@@ -16,9 +17,10 @@
  */
 static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 			     bool can_block, bool can_migrate, bool discarded_oblock,
-			     struct bio *bio, struct policy_result *result)
+			     struct bio *bio, struct policy_locker *locker,
+			     struct policy_result *result)
 {
-	return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
+	return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result);
 }
 
 static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
@@ -54,9 +56,10 @@ static inline int policy_walk_mappings(struct dm_cache_policy *p,
 
 static inline int policy_writeback_work(struct dm_cache_policy *p,
 					dm_oblock_t *oblock,
-					dm_cblock_t *cblock)
+					dm_cblock_t *cblock,
+					bool critical_only)
 {
-	return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
+	return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT;
 }
 
 static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
@@ -80,19 +83,21 @@ static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
 	return p->residency(p);
 }
 
-static inline void policy_tick(struct dm_cache_policy *p)
+static inline void policy_tick(struct dm_cache_policy *p, bool can_block)
 {
 	if (p->tick)
-		return p->tick(p);
+		return p->tick(p, can_block);
 }
 
-static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result,
+					    unsigned maxlen, ssize_t *sz_ptr)
 {
-	ssize_t sz = 0;
+	ssize_t sz = *sz_ptr;
 	if (p->emit_config_values)
-		return p->emit_config_values(p, result, maxlen);
+		return p->emit_config_values(p, result, maxlen, sz_ptr);
 
-	DMEMIT("0");
+	DMEMIT("0 ");
+	*sz_ptr = sz;
 	return 0;
 }
 
@@ -104,6 +109,33 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Some utility functions commonly used by policies and the core target.
+ */
+static inline size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static inline unsigned long *alloc_bitset(unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	return vzalloc(s);
+}
+
+static inline void clear_bitset(void *bitset, unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	memset(bitset, 0, s);
+}
+
+static inline void free_bitset(unsigned long *bits)
+{
+	vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
  */

+ 46 - 47
drivers/md/dm-cache-policy-mq.c

@@ -693,9 +693,10 @@ static void requeue(struct mq_policy *mq, struct entry *e)
  * - set the hit count to a hard coded value other than 1, eg, is it better
  *   if it goes in at level 2?
  */
-static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq,
+			 struct policy_locker *locker, dm_oblock_t *oblock)
 {
-	struct entry *demoted = pop(mq, &mq->cache_clean);
+	struct entry *demoted = peek(&mq->cache_clean);
 
 	if (!demoted)
 		/*
@@ -707,6 +708,13 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
 		 */
 		return -ENOSPC;
 
+	if (locker->fn(locker, demoted->oblock))
+		/*
+		 * We couldn't lock the demoted block.
+		 */
+		return -EBUSY;
+
+	del(mq, demoted);
 	*oblock = demoted->oblock;
 	free_entry(&mq->cache_pool, demoted);
 
@@ -795,6 +803,7 @@ static int cache_entry_found(struct mq_policy *mq,
  * finding which cache block to use.
  */
 static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
+			      struct policy_locker *locker,
 			      struct policy_result *result)
 {
 	int r;
@@ -803,11 +812,12 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 	/* Ensure there's a free cblock in the cache */
 	if (epool_empty(&mq->cache_pool)) {
 		result->op = POLICY_REPLACE;
-		r = demote_cblock(mq, &result->old_oblock);
+		r = demote_cblock(mq, locker, &result->old_oblock);
 		if (r) {
 			result->op = POLICY_MISS;
 			return 0;
 		}
+
 	} else
 		result->op = POLICY_NEW;
 
@@ -829,7 +839,8 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 
 static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 				 bool can_migrate, bool discarded_oblock,
-				 int data_dir, struct policy_result *result)
+				 int data_dir, struct policy_locker *locker,
+				 struct policy_result *result)
 {
 	int r = 0;
 
@@ -842,7 +853,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 
 	else {
 		requeue(mq, e);
-		r = pre_cache_to_cache(mq, e, result);
+		r = pre_cache_to_cache(mq, e, locker, result);
 	}
 
 	return r;
@@ -872,6 +883,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
 }
 
 static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
+			    struct policy_locker *locker,
 			    struct policy_result *result)
 {
 	int r;
@@ -879,7 +891,7 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 
 	if (epool_empty(&mq->cache_pool)) {
 		result->op = POLICY_REPLACE;
-		r = demote_cblock(mq, &result->old_oblock);
+		r = demote_cblock(mq, locker, &result->old_oblock);
 		if (unlikely(r)) {
 			result->op = POLICY_MISS;
 			insert_in_pre_cache(mq, oblock);
@@ -907,11 +919,12 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 
 static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
 			  bool can_migrate, bool discarded_oblock,
-			  int data_dir, struct policy_result *result)
+			  int data_dir, struct policy_locker *locker,
+			  struct policy_result *result)
 {
 	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
 		if (can_migrate)
-			insert_in_cache(mq, oblock, result);
+			insert_in_cache(mq, oblock, locker, result);
 		else
 			return -EWOULDBLOCK;
 	} else {
@@ -928,7 +941,8 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
  */
 static int map(struct mq_policy *mq, dm_oblock_t oblock,
 	       bool can_migrate, bool discarded_oblock,
-	       int data_dir, struct policy_result *result)
+	       int data_dir, struct policy_locker *locker,
+	       struct policy_result *result)
 {
 	int r = 0;
 	struct entry *e = hash_lookup(mq, oblock);
@@ -942,11 +956,11 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
 
 	else if (e)
 		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
-					  data_dir, result);
+					  data_dir, locker, result);
 
 	else
 		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
-				   data_dir, result);
+				   data_dir, locker, result);
 
 	if (r == -EWOULDBLOCK)
 		result->op = POLICY_MISS;
@@ -1012,7 +1026,8 @@ static void copy_tick(struct mq_policy *mq)
 
 static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 		  bool can_block, bool can_migrate, bool discarded_oblock,
-		  struct bio *bio, struct policy_result *result)
+		  struct bio *bio, struct policy_locker *locker,
+		  struct policy_result *result)
 {
 	int r;
 	struct mq_policy *mq = to_mq_policy(p);
@@ -1028,7 +1043,7 @@ static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 
 	iot_examine_bio(&mq->tracker, bio);
 	r = map(mq, oblock, can_migrate, discarded_oblock,
-		bio_data_dir(bio), result);
+		bio_data_dir(bio), locker, result);
 
 	mutex_unlock(&mq->lock);
 
@@ -1221,7 +1236,7 @@ static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
 }
 
 static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
-			     dm_cblock_t *cblock)
+			     dm_cblock_t *cblock, bool critical_only)
 {
 	int r;
 	struct mq_policy *mq = to_mq_policy(p);
@@ -1268,7 +1283,7 @@ static dm_cblock_t mq_residency(struct dm_cache_policy *p)
 	return r;
 }
 
-static void mq_tick(struct dm_cache_policy *p)
+static void mq_tick(struct dm_cache_policy *p, bool can_block)
 {
 	struct mq_policy *mq = to_mq_policy(p);
 	unsigned long flags;
@@ -1276,6 +1291,12 @@ static void mq_tick(struct dm_cache_policy *p)
 	spin_lock_irqsave(&mq->tick_lock, flags);
 	mq->tick_protected++;
 	spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+	if (can_block) {
+		mutex_lock(&mq->lock);
+		copy_tick(mq);
+		mutex_unlock(&mq->lock);
+	}
 }
 
 static int mq_set_config_value(struct dm_cache_policy *p,
@@ -1308,22 +1329,24 @@ static int mq_set_config_value(struct dm_cache_policy *p,
 	return 0;
 }
 
-static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
+				 unsigned maxlen, ssize_t *sz_ptr)
 {
-	ssize_t sz = 0;
+	ssize_t sz = *sz_ptr;
 	struct mq_policy *mq = to_mq_policy(p);
 
 	DMEMIT("10 random_threshold %u "
 	       "sequential_threshold %u "
 	       "discard_promote_adjustment %u "
 	       "read_promote_adjustment %u "
-	       "write_promote_adjustment %u",
+	       "write_promote_adjustment %u ",
 	       mq->tracker.thresholds[PATTERN_RANDOM],
 	       mq->tracker.thresholds[PATTERN_SEQUENTIAL],
 	       mq->discard_promote_adjustment,
 	       mq->read_promote_adjustment,
 	       mq->write_promote_adjustment);
 
+	*sz_ptr = sz;
 	return 0;
 }
 
@@ -1408,21 +1431,12 @@ bad_pre_cache_init:
 
 static struct dm_cache_policy_type mq_policy_type = {
 	.name = "mq",
-	.version = {1, 3, 0},
+	.version = {1, 4, 0},
 	.hint_size = 4,
 	.owner = THIS_MODULE,
 	.create = mq_create
 };
 
-static struct dm_cache_policy_type default_policy_type = {
-	.name = "default",
-	.version = {1, 3, 0},
-	.hint_size = 4,
-	.owner = THIS_MODULE,
-	.create = mq_create,
-	.real = &mq_policy_type
-};
-
 static int __init mq_init(void)
 {
 	int r;
@@ -1432,36 +1446,21 @@ static int __init mq_init(void)
 					   __alignof__(struct entry),
 					   0, NULL);
 	if (!mq_entry_cache)
-		goto bad;
+		return -ENOMEM;
 
 	r = dm_cache_policy_register(&mq_policy_type);
 	if (r) {
 		DMERR("register failed %d", r);
-		goto bad_register_mq;
-	}
-
-	r = dm_cache_policy_register(&default_policy_type);
-	if (!r) {
-		DMINFO("version %u.%u.%u loaded",
-		       mq_policy_type.version[0],
-		       mq_policy_type.version[1],
-		       mq_policy_type.version[2]);
-		return 0;
+		kmem_cache_destroy(mq_entry_cache);
+		return -ENOMEM;
 	}
 
-	DMERR("register failed (as default) %d", r);
-
-	dm_cache_policy_unregister(&mq_policy_type);
-bad_register_mq:
-	kmem_cache_destroy(mq_entry_cache);
-bad:
-	return -ENOMEM;
+	return 0;
 }
 
 static void __exit mq_exit(void)
 {
 	dm_cache_policy_unregister(&mq_policy_type);
-	dm_cache_policy_unregister(&default_policy_type);
 
 	kmem_cache_destroy(mq_entry_cache);
 }

+ 1791 - 0
drivers/md/dm-cache-policy-smq.c

@@ -0,0 +1,1791 @@
+/*
+ * Copyright (C) 2015 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm-cache-policy-internal.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/math64.h>
+
+#define DM_MSG_PREFIX "cache-policy-smq"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Safe division functions that return zero on divide by zero.
+ */
+static unsigned safe_div(unsigned n, unsigned d)
+{
+	return d ? n / d : 0u;
+}
+
+static unsigned safe_mod(unsigned n, unsigned d)
+{
+	return d ? n % d : 0u;
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry {
+	unsigned hash_next:28;
+	unsigned prev:28;
+	unsigned next:28;
+	unsigned level:7;
+	bool dirty:1;
+	bool allocated:1;
+	bool sentinel:1;
+
+	dm_oblock_t oblock;
+};
+
+/*----------------------------------------------------------------*/
+
+#define INDEXER_NULL ((1u << 28u) - 1u)
+
+/*
+ * An entry_space manages a set of entries that we use for the queues.
+ * The clean and dirty queues share entries, so this object is separate
+ * from the queue itself.
+ */
+struct entry_space {
+	struct entry *begin;
+	struct entry *end;
+};
+
+static int space_init(struct entry_space *es, unsigned nr_entries)
+{
+	if (!nr_entries) {
+		es->begin = es->end = NULL;
+		return 0;
+	}
+
+	es->begin = vzalloc(sizeof(struct entry) * nr_entries);
+	if (!es->begin)
+		return -ENOMEM;
+
+	es->end = es->begin + nr_entries;
+	return 0;
+}
+
+static void space_exit(struct entry_space *es)
+{
+	vfree(es->begin);
+}
+
+static struct entry *__get_entry(struct entry_space *es, unsigned block)
+{
+	struct entry *e;
+
+	e = es->begin + block;
+	BUG_ON(e >= es->end);
+
+	return e;
+}
+
+static unsigned to_index(struct entry_space *es, struct entry *e)
+{
+	BUG_ON(e < es->begin || e >= es->end);
+	return e - es->begin;
+}
+
+static struct entry *to_entry(struct entry_space *es, unsigned block)
+{
+	if (block == INDEXER_NULL)
+		return NULL;
+
+	return __get_entry(es, block);
+}
+
+/*----------------------------------------------------------------*/
+
+struct ilist {
+	unsigned nr_elts;	/* excluding sentinel entries */
+	unsigned head, tail;
+};
+
+static void l_init(struct ilist *l)
+{
+	l->nr_elts = 0;
+	l->head = l->tail = INDEXER_NULL;
+}
+
+static struct entry *l_head(struct entry_space *es, struct ilist *l)
+{
+	return to_entry(es, l->head);
+}
+
+static struct entry *l_tail(struct entry_space *es, struct ilist *l)
+{
+	return to_entry(es, l->tail);
+}
+
+static struct entry *l_next(struct entry_space *es, struct entry *e)
+{
+	return to_entry(es, e->next);
+}
+
+static struct entry *l_prev(struct entry_space *es, struct entry *e)
+{
+	return to_entry(es, e->prev);
+}
+
+static bool l_empty(struct ilist *l)
+{
+	return l->head == INDEXER_NULL;
+}
+
+static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+	struct entry *head = l_head(es, l);
+
+	e->next = l->head;
+	e->prev = INDEXER_NULL;
+
+	if (head)
+		head->prev = l->head = to_index(es, e);
+	else
+		l->head = l->tail = to_index(es, e);
+
+	if (!e->sentinel)
+		l->nr_elts++;
+}
+
+static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+	struct entry *tail = l_tail(es, l);
+
+	e->next = INDEXER_NULL;
+	e->prev = l->tail;
+
+	if (tail)
+		tail->next = l->tail = to_index(es, e);
+	else
+		l->head = l->tail = to_index(es, e);
+
+	if (!e->sentinel)
+		l->nr_elts++;
+}
+
+static void l_add_before(struct entry_space *es, struct ilist *l,
+			 struct entry *old, struct entry *e)
+{
+	struct entry *prev = l_prev(es, old);
+
+	if (!prev)
+		l_add_head(es, l, e);
+
+	else {
+		e->prev = old->prev;
+		e->next = to_index(es, old);
+		prev->next = old->prev = to_index(es, e);
+
+		if (!e->sentinel)
+			l->nr_elts++;
+	}
+}
+
+static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+	struct entry *prev = l_prev(es, e);
+	struct entry *next = l_next(es, e);
+
+	if (prev)
+		prev->next = e->next;
+	else
+		l->head = e->next;
+
+	if (next)
+		next->prev = e->prev;
+	else
+		l->tail = e->prev;
+
+	if (!e->sentinel)
+		l->nr_elts--;
+}
+
+static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
+{
+	struct entry *e;
+
+	for (e = l_tail(es, l); e; e = l_prev(es, e))
+		if (!e->sentinel) {
+			l_del(es, l, e);
+			return e;
+		}
+
+	return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The stochastic-multi-queue is a set of lru lists stacked into levels.
+ * Entries are moved up levels when they are used, which loosely orders the
+ * most accessed entries in the top levels and least in the bottom.  This
+ * structure is *much* better than a single lru list.
+ */
+#define MAX_LEVELS 64u
+
+struct queue {
+	struct entry_space *es;
+
+	unsigned nr_elts;
+	unsigned nr_levels;
+	struct ilist qs[MAX_LEVELS];
+
+	/*
+	 * We maintain a count of the number of entries we would like in each
+	 * level.
+	 */
+	unsigned last_target_nr_elts;
+	unsigned nr_top_levels;
+	unsigned nr_in_top_levels;
+	unsigned target_count[MAX_LEVELS];
+};
+
+static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
+{
+	unsigned i;
+
+	q->es = es;
+	q->nr_elts = 0;
+	q->nr_levels = nr_levels;
+
+	for (i = 0; i < q->nr_levels; i++) {
+		l_init(q->qs + i);
+		q->target_count[i] = 0u;
+	}
+
+	q->last_target_nr_elts = 0u;
+	q->nr_top_levels = 0u;
+	q->nr_in_top_levels = 0u;
+}
+
+static unsigned q_size(struct queue *q)
+{
+	return q->nr_elts;
+}
+
+/*
+ * Insert an entry to the back of the given level.
+ */
+static void q_push(struct queue *q, struct entry *e)
+{
+	if (!e->sentinel)
+		q->nr_elts++;
+
+	l_add_tail(q->es, q->qs + e->level, e);
+}
+
+static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
+{
+	if (!e->sentinel)
+		q->nr_elts++;
+
+	l_add_before(q->es, q->qs + e->level, old, e);
+}
+
+static void q_del(struct queue *q, struct entry *e)
+{
+	l_del(q->es, q->qs + e->level, e);
+	if (!e->sentinel)
+		q->nr_elts--;
+}
+
+/*
+ * Return the oldest entry of the lowest populated level.
+ */
+static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
+{
+	unsigned level;
+	struct entry *e;
+
+	max_level = min(max_level, q->nr_levels);
+
+	for (level = 0; level < max_level; level++)
+		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+			if (e->sentinel) {
+				if (can_cross_sentinel)
+					continue;
+				else
+					break;
+			}
+
+			return e;
+		}
+
+	return NULL;
+}
+
+static struct entry *q_pop(struct queue *q)
+{
+	struct entry *e = q_peek(q, q->nr_levels, true);
+
+	if (e)
+		q_del(q, e);
+
+	return e;
+}
+
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct entry *q_pop_old(struct queue *q, unsigned max_level)
+{
+	struct entry *e = q_peek(q, max_level, false);
+
+	if (e)
+		q_del(q, e);
+
+	return e;
+}
+
+/*
+ * This function assumes there is a non-sentinel entry to pop.  It's only
+ * used by redistribute, so we know this is true.  It also doesn't adjust
+ * the q->nr_elts count.
+ */
+static struct entry *__redist_pop_from(struct queue *q, unsigned level)
+{
+	struct entry *e;
+
+	for (; level < q->nr_levels; level++)
+		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
+			if (!e->sentinel) {
+				l_del(q->es, q->qs + e->level, e);
+				return e;
+			}
+
+	return NULL;
+}
+
+static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
+{
+	unsigned level, nr_levels, entries_per_level, remainder;
+
+	BUG_ON(lbegin > lend);
+	BUG_ON(lend > q->nr_levels);
+	nr_levels = lend - lbegin;
+	entries_per_level = safe_div(nr_elts, nr_levels);
+	remainder = safe_mod(nr_elts, nr_levels);
+
+	for (level = lbegin; level < lend; level++)
+		q->target_count[level] =
+			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
+}
+
+/*
+ * Typically we have fewer elements in the top few levels which allows us
+ * to adjust the promote threshold nicely.
+ */
+static void q_set_targets(struct queue *q)
+{
+	if (q->last_target_nr_elts == q->nr_elts)
+		return;
+
+	q->last_target_nr_elts = q->nr_elts;
+
+	if (q->nr_top_levels > q->nr_levels)
+		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
+
+	else {
+		q_set_targets_subrange_(q, q->nr_in_top_levels,
+					q->nr_levels - q->nr_top_levels, q->nr_levels);
+
+		if (q->nr_in_top_levels < q->nr_elts)
+			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
+						0, q->nr_levels - q->nr_top_levels);
+		else
+			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
+	}
+}
+
+static void q_redistribute(struct queue *q)
+{
+	unsigned target, level;
+	struct ilist *l, *l_above;
+	struct entry *e;
+
+	q_set_targets(q);
+
+	for (level = 0u; level < q->nr_levels - 1u; level++) {
+		l = q->qs + level;
+		target = q->target_count[level];
+
+		/*
+		 * Pull down some entries from the level above.
+		 */
+		while (l->nr_elts < target) {
+			e = __redist_pop_from(q, level + 1u);
+			if (!e) {
+				/* bug in nr_elts */
+				break;
+			}
+
+			e->level = level;
+			l_add_tail(q->es, l, e);
+		}
+
+		/*
+		 * Push some entries up.
+		 */
+		l_above = q->qs + level + 1u;
+		while (l->nr_elts > target) {
+			e = l_pop_tail(q->es, l);
+
+			if (!e)
+				/* bug in nr_elts */
+				break;
+
+			e->level = level + 1u;
+			l_add_head(q->es, l_above, e);
+		}
+	}
+}
+
+static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
+{
+	struct entry *de;
+	unsigned new_level;
+
+	q_del(q, e);
+
+	if (extra_levels && (e->level < q->nr_levels - 1u)) {
+		new_level = min(q->nr_levels - 1u, e->level + extra_levels);
+		for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
+			if (de->sentinel)
+				continue;
+
+			q_del(q, de);
+			de->level = e->level;
+
+			if (dest)
+				q_push_before(q, dest, de);
+			else
+				q_push(q, de);
+			break;
+		}
+
+		e->level = new_level;
+	}
+
+	q_push(q, e);
+}
+
+static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
+{
+	q_requeue_before(q, NULL, e, extra_levels);
+}
+
+/*----------------------------------------------------------------*/
+
+#define FP_SHIFT 8
+#define SIXTEENTH (1u << (FP_SHIFT - 4u))
+#define EIGHTH (1u << (FP_SHIFT - 3u))
+
+struct stats {
+	unsigned hit_threshold;
+	unsigned hits;
+	unsigned misses;
+};
+
+enum performance {
+	Q_POOR,
+	Q_FAIR,
+	Q_WELL
+};
+
+static void stats_init(struct stats *s, unsigned nr_levels)
+{
+	s->hit_threshold = (nr_levels * 3u) / 4u;
+	s->hits = 0u;
+	s->misses = 0u;
+}
+
+static void stats_reset(struct stats *s)
+{
+	s->hits = s->misses = 0u;
+}
+
+static void stats_level_accessed(struct stats *s, unsigned level)
+{
+	if (level >= s->hit_threshold)
+		s->hits++;
+	else
+		s->misses++;
+}
+
+static void stats_miss(struct stats *s)
+{
+	s->misses++;
+}
+
+/*
+ * There are times when we don't have any confidence in the hotspot queue.
+ * Such as when a fresh cache is created and the blocks have been spread
+ * out across the levels, or if an io load changes.  We detect this by
+ * seeing how often a lookup is in the top levels of the hotspot queue.
+ */
+static enum performance stats_assess(struct stats *s)
+{
+	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
+
+	if (confidence < SIXTEENTH)
+		return Q_POOR;
+
+	else if (confidence < EIGHTH)
+		return Q_FAIR;
+
+	else
+		return Q_WELL;
+}
+
+/*----------------------------------------------------------------*/
+
+struct hash_table {
+	struct entry_space *es;
+	unsigned long long hash_bits;
+	unsigned *buckets;
+};
+
+/*
+ * All cache entries are stored in a chained hash table.  To save space we
+ * use indexing again, and only store indexes to the next entry.
+ */
+static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
+{
+	unsigned i, nr_buckets;
+
+	ht->es = es;
+	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
+	ht->hash_bits = ffs(nr_buckets) - 1;
+
+	ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
+	if (!ht->buckets)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_buckets; i++)
+		ht->buckets[i] = INDEXER_NULL;
+
+	return 0;
+}
+
+static void h_exit(struct hash_table *ht)
+{
+	vfree(ht->buckets);
+}
+
+static struct entry *h_head(struct hash_table *ht, unsigned bucket)
+{
+	return to_entry(ht->es, ht->buckets[bucket]);
+}
+
+static struct entry *h_next(struct hash_table *ht, struct entry *e)
+{
+	return to_entry(ht->es, e->hash_next);
+}
+
+static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
+{
+	e->hash_next = ht->buckets[bucket];
+	ht->buckets[bucket] = to_index(ht->es, e);
+}
+
+static void h_insert(struct hash_table *ht, struct entry *e)
+{
+	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+	__h_insert(ht, h, e);
+}
+
+static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
+				struct entry **prev)
+{
+	struct entry *e;
+
+	*prev = NULL;
+	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
+		if (e->oblock == oblock)
+			return e;
+
+		*prev = e;
+	}
+
+	return NULL;
+}
+
+static void __h_unlink(struct hash_table *ht, unsigned h,
+		       struct entry *e, struct entry *prev)
+{
+	if (prev)
+		prev->hash_next = e->hash_next;
+	else
+		ht->buckets[h] = e->hash_next;
+}
+
+/*
+ * Also moves each entry to the front of the bucket.
+ */
+static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
+{
+	struct entry *e, *prev;
+	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
+
+	e = __h_lookup(ht, h, oblock, &prev);
+	if (e && prev) {
+		/*
+		 * Move to the front because this entry is likely
+		 * to be hit again.
+		 */
+		__h_unlink(ht, h, e, prev);
+		__h_insert(ht, h, e);
+	}
+
+	return e;
+}
+
+static void h_remove(struct hash_table *ht, struct entry *e)
+{
+	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+	struct entry *prev;
+
+	/*
+	 * The down side of using a singly linked list is we have to
+	 * iterate the bucket to remove an item.
+	 */
+	e = __h_lookup(ht, h, e->oblock, &prev);
+	if (e)
+		__h_unlink(ht, h, e, prev);
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry_alloc {
+	struct entry_space *es;
+	unsigned begin;
+
+	unsigned nr_allocated;
+	struct ilist free;
+};
+
+static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
+			   unsigned begin, unsigned end)
+{
+	unsigned i;
+
+	ea->es = es;
+	ea->nr_allocated = 0u;
+	ea->begin = begin;
+
+	l_init(&ea->free);
+	for (i = begin; i != end; i++)
+		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
+}
+
+static void init_entry(struct entry *e)
+{
+	/*
+	 * We can't memset because that would clear the hotspot and
+	 * sentinel bits which remain constant.
+	 */
+	e->hash_next = INDEXER_NULL;
+	e->next = INDEXER_NULL;
+	e->prev = INDEXER_NULL;
+	e->level = 0u;
+	e->allocated = true;
+}
+
+static struct entry *alloc_entry(struct entry_alloc *ea)
+{
+	struct entry *e;
+
+	if (l_empty(&ea->free))
+		return NULL;
+
+	e = l_pop_tail(ea->es, &ea->free);
+	init_entry(e);
+	ea->nr_allocated++;
+
+	return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
+{
+	struct entry *e = __get_entry(ea->es, ea->begin + i);
+
+	BUG_ON(e->allocated);
+
+	l_del(ea->es, &ea->free, e);
+	init_entry(e);
+	ea->nr_allocated++;
+
+	return e;
+}
+
+static void free_entry(struct entry_alloc *ea, struct entry *e)
+{
+	BUG_ON(!ea->nr_allocated);
+	BUG_ON(!e->allocated);
+
+	ea->nr_allocated--;
+	e->allocated = false;
+	l_add_tail(ea->es, &ea->free, e);
+}
+
+static bool allocator_empty(struct entry_alloc *ea)
+{
+	return l_empty(&ea->free);
+}
+
+static unsigned get_index(struct entry_alloc *ea, struct entry *e)
+{
+	return to_index(ea->es, e) - ea->begin;
+}
+
+static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
+{
+	return __get_entry(ea->es, ea->begin + index);
+}
+
+/*----------------------------------------------------------------*/
+
+#define NR_HOTSPOT_LEVELS 64u
+#define NR_CACHE_LEVELS 64u
+
+#define WRITEBACK_PERIOD (10 * HZ)
+#define DEMOTE_PERIOD (60 * HZ)
+
+#define HOTSPOT_UPDATE_PERIOD (HZ)
+#define CACHE_UPDATE_PERIOD (10u * HZ)
+
+struct smq_policy {
+	struct dm_cache_policy policy;
+
+	/* protects everything */
+	struct mutex lock;
+	dm_cblock_t cache_size;
+	sector_t cache_block_size;
+
+	sector_t hotspot_block_size;
+	unsigned nr_hotspot_blocks;
+	unsigned cache_blocks_per_hotspot_block;
+	unsigned hotspot_level_jump;
+
+	struct entry_space es;
+	struct entry_alloc writeback_sentinel_alloc;
+	struct entry_alloc demote_sentinel_alloc;
+	struct entry_alloc hotspot_alloc;
+	struct entry_alloc cache_alloc;
+
+	unsigned long *hotspot_hit_bits;
+	unsigned long *cache_hit_bits;
+
+	/*
+	 * We maintain three queues of entries.  The cache proper consists
+	 * of a clean and a dirty queue, which contain the currently
+	 * active mappings.  The hotspot queue uses a larger block size to
+	 * track blocks that are being hit frequently and potential
+	 * candidates for promotion to the cache.
+	 */
+	struct queue hotspot;
+	struct queue clean;
+	struct queue dirty;
+
+	struct stats hotspot_stats;
+	struct stats cache_stats;
+
+	/*
+	 * Keeps track of time, incremented by the core.  We use this to
+	 * avoid attributing multiple hits within the same tick.
+	 *
+	 * Access to tick_protected should be done with the spin lock held.
+	 * It's copied to tick at the start of the map function (within the
+	 * mutex).
+	 */
+	spinlock_t tick_lock;
+	unsigned tick_protected;
+	unsigned tick;
+
+	/*
+	 * The hash tables allow us to quickly find an entry by origin
+	 * block.
+	 */
+	struct hash_table table;
+	struct hash_table hotspot_table;
+
+	bool current_writeback_sentinels;
+	unsigned long next_writeback_period;
+
+	bool current_demote_sentinels;
+	unsigned long next_demote_period;
+
+	unsigned write_promote_level;
+	unsigned read_promote_level;
+
+	unsigned long next_hotspot_period;
+	unsigned long next_cache_period;
+};
+
+/*----------------------------------------------------------------*/
+
+static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
+{
+	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
+}
+
+static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
+{
+	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
+}
+
+static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
+{
+	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
+}
+
+static void __update_writeback_sentinels(struct smq_policy *mq)
+{
+	unsigned level;
+	struct queue *q = &mq->dirty;
+	struct entry *sentinel;
+
+	for (level = 0; level < q->nr_levels; level++) {
+		sentinel = writeback_sentinel(mq, level);
+		q_del(q, sentinel);
+		q_push(q, sentinel);
+	}
+}
+
+static void __update_demote_sentinels(struct smq_policy *mq)
+{
+	unsigned level;
+	struct queue *q = &mq->clean;
+	struct entry *sentinel;
+
+	for (level = 0; level < q->nr_levels; level++) {
+		sentinel = demote_sentinel(mq, level);
+		q_del(q, sentinel);
+		q_push(q, sentinel);
+	}
+}
+
+static void update_sentinels(struct smq_policy *mq)
+{
+	if (time_after(jiffies, mq->next_writeback_period)) {
+		__update_writeback_sentinels(mq);
+		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+	}
+
+	if (time_after(jiffies, mq->next_demote_period)) {
+		__update_demote_sentinels(mq);
+		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+		mq->current_demote_sentinels = !mq->current_demote_sentinels;
+	}
+}
+
+static void __sentinels_init(struct smq_policy *mq)
+{
+	unsigned level;
+	struct entry *sentinel;
+
+	for (level = 0; level < NR_CACHE_LEVELS; level++) {
+		sentinel = writeback_sentinel(mq, level);
+		sentinel->level = level;
+		q_push(&mq->dirty, sentinel);
+
+		sentinel = demote_sentinel(mq, level);
+		sentinel->level = level;
+		q_push(&mq->clean, sentinel);
+	}
+}
+
+static void sentinels_init(struct smq_policy *mq)
+{
+	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+
+	mq->current_writeback_sentinels = false;
+	mq->current_demote_sentinels = false;
+	__sentinels_init(mq);
+
+	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+	mq->current_demote_sentinels = !mq->current_demote_sentinels;
+	__sentinels_init(mq);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These methods tie together the dirty queue, clean queue and hash table.
+ */
+static void push_new(struct smq_policy *mq, struct entry *e)
+{
+	struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
+	h_insert(&mq->table, e);
+	q_push(q, e);
+}
+
+static void push(struct smq_policy *mq, struct entry *e)
+{
+	struct entry *sentinel;
+
+	h_insert(&mq->table, e);
+
+	/*
+	 * Punch this into the queue just in front of the sentinel, to
+	 * ensure it's cleaned straight away.
+	 */
+	if (e->dirty) {
+		sentinel = writeback_sentinel(mq, e->level);
+		q_push_before(&mq->dirty, sentinel, e);
+	} else {
+		sentinel = demote_sentinel(mq, e->level);
+		q_push_before(&mq->clean, sentinel, e);
+	}
+}
+
+/*
+ * Removes an entry from cache.  Removes from the hash table.
+ */
+static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
+{
+	q_del(q, e);
+	h_remove(&mq->table, e);
+}
+
+static void del(struct smq_policy *mq, struct entry *e)
+{
+	__del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
+}
+
+static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
+{
+	struct entry *e = q_pop_old(q, max_level);
+	if (e)
+		h_remove(&mq->table, e);
+	return e;
+}
+
+static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
+{
+	return to_cblock(get_index(&mq->cache_alloc, e));
+}
+
+static void requeue(struct smq_policy *mq, struct entry *e)
+{
+	struct entry *sentinel;
+
+	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
+		if (e->dirty) {
+			sentinel = writeback_sentinel(mq, e->level);
+			q_requeue_before(&mq->dirty, sentinel, e, 1u);
+		} else {
+			sentinel = demote_sentinel(mq, e->level);
+			q_requeue_before(&mq->clean, sentinel, e, 1u);
+		}
+	}
+}
+
+static unsigned default_promote_level(struct smq_policy *mq)
+{
+	/*
+	 * The promote level depends on the current performance of the
+	 * cache.
+	 *
+	 * If the cache is performing badly, then we can't afford
+	 * to promote much without causing performance to drop below that
+	 * of the origin device.
+	 *
+	 * If the cache is performing well, then we don't need to promote
+	 * much.  If it isn't broken, don't fix it.
+	 *
+	 * If the cache is middling then we promote more.
+	 *
+	 * This scheme reminds me of a graph of entropy vs probability of a
+	 * binary variable.
+	 */
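+	/*
+	 * Illustrative arithmetic: the index below is
+	 * (hits << 4) / (hits + misses), so an all-miss or all-hit cache
+	 * indexes entry 0 or 16 (both 1, little promotion), while a 50%
+	 * hit ratio indexes entry 8 (value 7, much more promotion).
+	 */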
+	static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
+
+	unsigned hits = mq->cache_stats.hits;
+	unsigned misses = mq->cache_stats.misses;
+	unsigned index = safe_div(hits << 4u, hits + misses);
+	return table[index];
+}
+
+static void update_promote_levels(struct smq_policy *mq)
+{
+	/*
+	 * If there are unused cache entries then we want to be really
+	 * eager to promote.
+	 */
+	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
+		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
+
+	/*
+	 * If the hotspot queue is performing badly then we have little
+	 * confidence that we know which blocks to promote.  So we cut down
+	 * the amount of promotions.
+	 */
+	switch (stats_assess(&mq->hotspot_stats)) {
+	case Q_POOR:
+		threshold_level /= 4u;
+		break;
+
+	case Q_FAIR:
+		threshold_level /= 2u;
+		break;
+
+	case Q_WELL:
+		break;
+	}
+
+	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
+	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
+}
+
+/*
+ * If the hotspot queue is performing badly, then we try to move entries
+ * around more quickly.
+ */
+static void update_level_jump(struct smq_policy *mq)
+{
+	switch (stats_assess(&mq->hotspot_stats)) {
+	case Q_POOR:
+		mq->hotspot_level_jump = 4u;
+		break;
+
+	case Q_FAIR:
+		mq->hotspot_level_jump = 2u;
+		break;
+
+	case Q_WELL:
+		mq->hotspot_level_jump = 1u;
+		break;
+	}
+}
+
+static void end_hotspot_period(struct smq_policy *mq)
+{
+	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+	update_promote_levels(mq);
+
+	if (time_after(jiffies, mq->next_hotspot_period)) {
+		update_level_jump(mq);
+		q_redistribute(&mq->hotspot);
+		stats_reset(&mq->hotspot_stats);
+		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
+	}
+}
+
+static void end_cache_period(struct smq_policy *mq)
+{
+	if (time_after(jiffies, mq->next_cache_period)) {
+		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+
+		q_redistribute(&mq->dirty);
+		q_redistribute(&mq->clean);
+		stats_reset(&mq->cache_stats);
+
+		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
+	}
+}
+
+static int demote_cblock(struct smq_policy *mq,
+			 struct policy_locker *locker,
+			 dm_oblock_t *oblock)
+{
+	struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
+	if (!demoted)
+		/*
+		 * We could get a block from mq->dirty, but that
+		 * would add extra latency to the triggering bio as it
+		 * waits for the writeback.  Better to not promote this
+		 * time and hope there's a clean block next time this block
+		 * is hit.
+		 */
+		return -ENOSPC;
+
+	if (locker->fn(locker, demoted->oblock))
+		/*
+		 * We couldn't lock this block.
+		 */
+		return -EBUSY;
+
+	del(mq, demoted);
+	*oblock = demoted->oblock;
+	free_entry(&mq->cache_alloc, demoted);
+
+	return 0;
+}
+
+enum promote_result {
+	PROMOTE_NOT,
+	PROMOTE_TEMPORARY,
+	PROMOTE_PERMANENT
+};
+
+/*
+ * Converts a boolean into a promote result.
+ */
+static enum promote_result maybe_promote(bool promote)
+{
+	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
+}
+
+static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
+					  bool fast_promote)
+{
+	if (bio_data_dir(bio) == WRITE) {
+		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
+			return PROMOTE_TEMPORARY;
+
+		else
+			return maybe_promote(hs_e->level >= mq->write_promote_level);
+	} else
+		return maybe_promote(hs_e->level >= mq->read_promote_level);
+}
+
+static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
+			    struct policy_locker *locker,
+			    struct policy_result *result, enum promote_result pr)
+{
+	int r;
+	struct entry *e;
+
+	if (allocator_empty(&mq->cache_alloc)) {
+		result->op = POLICY_REPLACE;
+		r = demote_cblock(mq, locker, &result->old_oblock);
+		if (r) {
+			result->op = POLICY_MISS;
+			return;
+		}
+
+	} else
+		result->op = POLICY_NEW;
+
+	e = alloc_entry(&mq->cache_alloc);
+	BUG_ON(!e);
+	e->oblock = oblock;
+
+	if (pr == PROMOTE_TEMPORARY)
+		push(mq, e);
+	else
+		push_new(mq, e);
+
+	result->cblock = infer_cblock(mq, e);
+}
+
+static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
+{
+	sector_t r = from_oblock(b);
+	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
+	return to_oblock(r);
+}
+
+static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
+{
+	unsigned hi;
+	dm_oblock_t hb = to_hblock(mq, b);
+	struct entry *e = h_lookup(&mq->hotspot_table, hb);
+
+	if (e) {
+		stats_level_accessed(&mq->hotspot_stats, e->level);
+
+		hi = get_index(&mq->hotspot_alloc, e);
+		q_requeue(&mq->hotspot, e,
+			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
+			  0u : mq->hotspot_level_jump);
+
+	} else {
+		stats_miss(&mq->hotspot_stats);
+
+		e = alloc_entry(&mq->hotspot_alloc);
+		if (!e) {
+			e = q_pop(&mq->hotspot);
+			if (e) {
+				h_remove(&mq->hotspot_table, e);
+				hi = get_index(&mq->hotspot_alloc, e);
+				clear_bit(hi, mq->hotspot_hit_bits);
+			}
+
+		}
+
+		if (e) {
+			e->oblock = hb;
+			q_push(&mq->hotspot, e);
+			h_insert(&mq->hotspot_table, e);
+		}
+	}
+
+	return e;
+}
+
+/*
+ * Looks the oblock up in the hash table, then decides whether to remap to
+ * the cache (hit), remap to the origin (miss), or promote the block.
+ */
+static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
+	       bool can_migrate, bool fast_promote,
+	       struct policy_locker *locker, struct policy_result *result)
+{
+	struct entry *e, *hs_e;
+	enum promote_result pr;
+
+	hs_e = update_hotspot_queue(mq, oblock, bio);
+
+	e = h_lookup(&mq->table, oblock);
+	if (e) {
+		stats_level_accessed(&mq->cache_stats, e->level);
+
+		requeue(mq, e);
+		result->op = POLICY_HIT;
+		result->cblock = infer_cblock(mq, e);
+
+	} else {
+		stats_miss(&mq->cache_stats);
+
+		pr = should_promote(mq, hs_e, bio, fast_promote);
+		if (pr == PROMOTE_NOT)
+			result->op = POLICY_MISS;
+
+		else {
+			if (!can_migrate) {
+				result->op = POLICY_MISS;
+				return -EWOULDBLOCK;
+			}
+
+			insert_in_cache(mq, oblock, locker, result, pr);
+		}
+	}
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Public interface, via the policy struct.  See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
+{
+	return container_of(p, struct smq_policy, policy);
+}
+
+static void smq_destroy(struct dm_cache_policy *p)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	h_exit(&mq->hotspot_table);
+	h_exit(&mq->table);
+	free_bitset(mq->hotspot_hit_bits);
+	free_bitset(mq->cache_hit_bits);
+	space_exit(&mq->es);
+	kfree(mq);
+}
+
+static void copy_tick(struct smq_policy *mq)
+{
+	unsigned long flags, tick;
+
+	spin_lock_irqsave(&mq->tick_lock, flags);
+	tick = mq->tick_protected;
+	if (tick != mq->tick) {
+		update_sentinels(mq);
+		end_hotspot_period(mq);
+		end_cache_period(mq);
+		mq->tick = tick;
+	}
+	spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static bool maybe_lock(struct smq_policy *mq, bool can_block)
+{
+	if (can_block) {
+		mutex_lock(&mq->lock);
+		return true;
+	} else
+		return mutex_trylock(&mq->lock);
+}
+
+static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+		   bool can_block, bool can_migrate, bool fast_promote,
+		   struct bio *bio, struct policy_locker *locker,
+		   struct policy_result *result)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	result->op = POLICY_MISS;
+
+	if (!maybe_lock(mq, can_block))
+		return -EWOULDBLOCK;
+
+	copy_tick(mq);
+	r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+	struct entry *e;
+
+	if (!mutex_trylock(&mq->lock))
+		return -EWOULDBLOCK;
+
+	e = h_lookup(&mq->table, oblock);
+	if (e) {
+		*cblock = infer_cblock(mq, e);
+		r = 0;
+	} else
+		r = -ENOENT;
+
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
+{
+	struct entry *e;
+
+	e = h_lookup(&mq->table, oblock);
+	BUG_ON(!e);
+
+	del(mq, e);
+	e->dirty = set;
+	push(mq, e);
+}
+
+static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__smq_set_clear_dirty(mq, oblock, true);
+	mutex_unlock(&mq->lock);
+}
+
+static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__smq_set_clear_dirty(mq, oblock, false);
+	mutex_unlock(&mq->lock);
+}
+
+static int smq_load_mapping(struct dm_cache_policy *p,
+			    dm_oblock_t oblock, dm_cblock_t cblock,
+			    uint32_t hint, bool hint_valid)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+	struct entry *e;
+
+	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
+	e->oblock = oblock;
+	e->dirty = false;	/* this gets corrected in a minute */
+	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
+	push(mq, e);
+
+	return 0;
+}
+
+static int smq_save_hints(struct smq_policy *mq, struct queue *q,
+			  policy_walk_fn fn, void *context)
+{
+	int r;
+	unsigned level;
+	struct entry *e;
+
+	for (level = 0; level < q->nr_levels; level++)
+		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+			if (!e->sentinel) {
+				r = fn(context, infer_cblock(mq, e),
+				       e->oblock, e->level);
+				if (r)
+					return r;
+			}
+		}
+
+	return 0;
+}
+
+static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+			     void *context)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+	int r = 0;
+
+	mutex_lock(&mq->lock);
+
+	r = smq_save_hints(mq, &mq->clean, fn, context);
+	if (!r)
+		r = smq_save_hints(mq, &mq->dirty, fn, context);
+
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
+{
+	struct entry *e;
+
+	e = h_lookup(&mq->table, oblock);
+	BUG_ON(!e);
+
+	del(mq, e);
+	free_entry(&mq->cache_alloc, e);
+}
+
+static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__remove_mapping(mq, oblock);
+	mutex_unlock(&mq->lock);
+}
+
+static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
+{
+	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
+
+	if (!e || !e->allocated)
+		return -ENODATA;
+
+	del(mq, e);
+	free_entry(&mq->cache_alloc, e);
+
+	return 0;
+}
+
+static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = __remove_cblock(mq, cblock);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+#define CLEAN_TARGET_CRITICAL 5u /* percent */
+
+static bool clean_target_met(struct smq_policy *mq, bool critical)
+{
+	if (critical) {
+		/*
+		 * Cache entries may not be populated.  So we cannot rely on the
+		 * size of the clean queue.
+		 */
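+		/*
+		 * Illustrative: for a 1000 block cache the critical target
+		 * below works out as 1000 * 5 / 100 = 50 clean blocks.
+		 */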
+		unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
+		unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
+
+		return nr_clean >= target;
+	} else
+		return !q_size(&mq->dirty);
+}
+
+static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
+				dm_cblock_t *cblock, bool critical_only)
+{
+	struct entry *e = NULL;
+	bool target_met = clean_target_met(mq, critical_only);
+
+	if (critical_only)
+		/*
+		 * Always try and keep the bottom level clean.
+		 */
+		e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
+
+	else
+		e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
+
+	if (!e)
+		return -ENODATA;
+
+	*oblock = e->oblock;
+	*cblock = infer_cblock(mq, e);
+	e->dirty = false;
+	push_new(mq, e);
+
+	return 0;
+}
+
+static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+			      dm_cblock_t *cblock, bool critical_only)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = __smq_writeback_work(mq, oblock, cblock, critical_only);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __force_mapping(struct smq_policy *mq,
+			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+	struct entry *e = h_lookup(&mq->table, current_oblock);
+
+	if (e) {
+		del(mq, e);
+		e->oblock = new_oblock;
+		e->dirty = true;
+		push(mq, e);
+	}
+}
+
+static void smq_force_mapping(struct dm_cache_policy *p,
+			      dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__force_mapping(mq, current_oblock, new_oblock);
+	mutex_unlock(&mq->lock);
+}
+
+static dm_cblock_t smq_residency(struct dm_cache_policy *p)
+{
+	dm_cblock_t r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = to_cblock(mq->cache_alloc.nr_allocated);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void smq_tick(struct dm_cache_policy *p, bool can_block)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mq->tick_lock, flags);
+	mq->tick_protected++;
+	spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+	if (can_block) {
+		mutex_lock(&mq->lock);
+		copy_tick(mq);
+		mutex_unlock(&mq->lock);
+	}
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct smq_policy *mq)
+{
+	mq->policy.destroy = smq_destroy;
+	mq->policy.map = smq_map;
+	mq->policy.lookup = smq_lookup;
+	mq->policy.set_dirty = smq_set_dirty;
+	mq->policy.clear_dirty = smq_clear_dirty;
+	mq->policy.load_mapping = smq_load_mapping;
+	mq->policy.walk_mappings = smq_walk_mappings;
+	mq->policy.remove_mapping = smq_remove_mapping;
+	mq->policy.remove_cblock = smq_remove_cblock;
+	mq->policy.writeback_work = smq_writeback_work;
+	mq->policy.force_mapping = smq_force_mapping;
+	mq->policy.residency = smq_residency;
+	mq->policy.tick = smq_tick;
+}
+
+static bool too_many_hotspot_blocks(sector_t origin_size,
+				    sector_t hotspot_block_size,
+				    unsigned nr_hotspot_blocks)
+{
+	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
+}
+
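+/*
+ * Illustrative sizing: with 128 sector (64KiB) cache blocks the initial
+ * hotspot block size is 16 * 128 = 2048 sectors; it is then halved while
+ * the hotspot blocks would cover more than the origin device (but never
+ * shrunk below the cache block size).
+ */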
+static void calc_hotspot_params(sector_t origin_size,
+				sector_t cache_block_size,
+				unsigned nr_cache_blocks,
+				sector_t *hotspot_block_size,
+				unsigned *nr_hotspot_blocks)
+{
+	*hotspot_block_size = cache_block_size * 16u;
+	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
+
+	while ((*hotspot_block_size > cache_block_size) &&
+	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
+		*hotspot_block_size /= 2u;
+}
+
+static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
+					  sector_t origin_size,
+					  sector_t cache_block_size)
+{
+	unsigned i;
+	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
+	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
+	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
+
+	if (!mq)
+		return NULL;
+
+	init_policy_functions(mq);
+	mq->cache_size = cache_size;
+	mq->cache_block_size = cache_block_size;
+
+	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
+			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
+
+	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
+	mq->hotspot_level_jump = 1u;
+	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
+		DMERR("couldn't initialize entry space");
+		goto bad_pool_init;
+	}
+
+	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
+	for (i = 0; i < nr_sentinels_per_queue; i++)
+		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
+
+	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
+	for (i = 0; i < nr_sentinels_per_queue; i++)
+		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
+
+	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
+		       total_sentinels + mq->nr_hotspot_blocks);
+
+	init_allocator(&mq->cache_alloc, &mq->es,
+		       total_sentinels + mq->nr_hotspot_blocks,
+		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
+
+	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
+	if (!mq->hotspot_hit_bits) {
+		DMERR("couldn't allocate hotspot hit bitset");
+		goto bad_hotspot_hit_bits;
+	}
+	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+
+	if (from_cblock(cache_size)) {
+		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
+		if (!mq->cache_hit_bits) {
+			DMERR("couldn't allocate cache hit bitset");
+			goto bad_cache_hit_bits;
+		}
+		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+	} else
+		mq->cache_hit_bits = NULL;
+
+	mq->tick_protected = 0;
+	mq->tick = 0;
+	mutex_init(&mq->lock);
+	spin_lock_init(&mq->tick_lock);
+
+	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
+	mq->hotspot.nr_top_levels = 8;
+	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
+					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
+
+	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
+	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
+
+	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
+	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
+
+	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
+		goto bad_alloc_table;
+
+	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
+		goto bad_alloc_hotspot_table;
+
+	sentinels_init(mq);
+	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
+
+	mq->next_hotspot_period = jiffies;
+	mq->next_cache_period = jiffies;
+
+	return &mq->policy;
+
+bad_alloc_hotspot_table:
+	h_exit(&mq->table);
+bad_alloc_table:
+	free_bitset(mq->cache_hit_bits);
+bad_cache_hit_bits:
+	free_bitset(mq->hotspot_hit_bits);
+bad_hotspot_hit_bits:
+	space_exit(&mq->es);
+bad_pool_init:
+	kfree(mq);
+
+	return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type smq_policy_type = {
+	.name = "smq",
+	.version = {1, 0, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = smq_create
+};
+
+static struct dm_cache_policy_type default_policy_type = {
+	.name = "default",
+	.version = {1, 0, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = smq_create,
+	.real = &smq_policy_type
+};
+
+static int __init smq_init(void)
+{
+	int r;
+
+	r = dm_cache_policy_register(&smq_policy_type);
+	if (r) {
+		DMERR("register failed %d", r);
+		return -ENOMEM;
+	}
+
+	r = dm_cache_policy_register(&default_policy_type);
+	if (r) {
+		DMERR("register failed (as default) %d", r);
+		dm_cache_policy_unregister(&smq_policy_type);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void __exit smq_exit(void)
+{
+	dm_cache_policy_unregister(&smq_policy_type);
+	dm_cache_policy_unregister(&default_policy_type);
+}
+
+module_init(smq_init);
+module_exit(smq_exit);
+
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("smq cache policy");

+ 23 - 7
drivers/md/dm-cache-policy.h

@@ -69,6 +69,18 @@ enum policy_operation {
 	POLICY_REPLACE
 };
 
+/*
+ * When issuing a POLICY_REPLACE the policy needs to make a callback to
+ * lock the block being demoted.  This doesn't need to occur during a
+ * writeback operation since the block remains in the cache.
+ */
+struct policy_locker;
+typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
+
+struct policy_locker {
+	policy_lock_fn fn;
+};
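+
+/*
+ * For an example implementation see cell_locker() in dm-cache-target.c,
+ * which detains the block being demoted in a bio prison cell.
+ */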
+
 /*
  * This is the instruction passed back to the core target.
  */
@@ -122,7 +134,8 @@ struct dm_cache_policy {
 	 */
 	int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
 		   bool can_block, bool can_migrate, bool discarded_oblock,
-		   struct bio *bio, struct policy_result *result);
+		   struct bio *bio, struct policy_locker *locker,
+		   struct policy_result *result);
 
 	/*
 	 * Sometimes we want to see if a block is in the cache, without
@@ -165,7 +178,9 @@ struct dm_cache_policy {
 	int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
 
 	/*
-	 * Provide a dirty block to be written back by the core target.
+	 * Provide a dirty block to be written back by the core target.  If
+	 * critical_only is set then the policy should only provide work if
+	 * it urgently needs it.
 	 *
 	 * Returns:
 	 *
@@ -173,7 +188,8 @@ struct dm_cache_policy {
 	 *
 	 * -ENODATA: no dirty blocks available
 	 */
-	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock,
+			      bool critical_only);
 
 	/*
 	 * How full is the cache?
@@ -184,16 +200,16 @@ struct dm_cache_policy {
 	 * Because of where we sit in the block layer, we can be asked to
 	 * map a lot of little bios that are all in the same block (no
 	 * queue merging has occurred).  To stop the policy being fooled by
-	 * these the core target sends regular tick() calls to the policy.
+	 * these, the core target sends regular tick() calls to the policy.
 	 * The policy should only count an entry as hit once per tick.
 	 */
-	void (*tick)(struct dm_cache_policy *p);
+	void (*tick)(struct dm_cache_policy *p, bool can_block);
 
 	/*
 	 * Configuration.
 	 */
-	int (*emit_config_values)(struct dm_cache_policy *p,
-				  char *result, unsigned maxlen);
+	int (*emit_config_values)(struct dm_cache_policy *p, char *result,
+				  unsigned maxlen, ssize_t *sz_ptr);
 	int (*set_config_value)(struct dm_cache_policy *p,
 				const char *key, const char *value);
 

+ 659 - 173
drivers/md/dm-cache-target.c

@@ -25,43 +25,92 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
 
 /*----------------------------------------------------------------*/
 
-/*
- * Glossary:
- *
- * oblock: index of an origin block
- * cblock: index of a cache block
- * promotion: movement of a block from origin to cache
- * demotion: movement of a block from cache to origin
- * migration: movement of a block between the origin and cache device,
- *	      either direction
- */
+#define IOT_RESOLUTION 4
 
-/*----------------------------------------------------------------*/
+struct io_tracker {
+	spinlock_t lock;
 
-static size_t bitset_size_in_bytes(unsigned nr_entries)
+	/*
+	 * Sectors of in-flight IO.
+	 */
+	sector_t in_flight;
+
+	/*
+	 * The time, in jiffies, when this device became idle (if it is
+	 * indeed idle).
+	 */
+	unsigned long idle_time;
+	unsigned long last_update_time;
+};
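+
+/*
+ * Used below (see writeback_some_dirty_blocks()) to decide whether the
+ * origin has been idle, e.g. no IO in flight for the last second, before
+ * allowing non-critical writeback work.
+ */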
+
+static void iot_init(struct io_tracker *iot)
 {
-	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+	spin_lock_init(&iot->lock);
+	iot->in_flight = 0ul;
+	iot->idle_time = 0ul;
+	iot->last_update_time = jiffies;
 }
 
-static unsigned long *alloc_bitset(unsigned nr_entries)
+static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
 {
-	size_t s = bitset_size_in_bytes(nr_entries);
-	return vzalloc(s);
+	if (iot->in_flight)
+		return false;
+
+	return time_after(jiffies, iot->idle_time + jifs);
 }
 
-static void clear_bitset(void *bitset, unsigned nr_entries)
+static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
 {
-	size_t s = bitset_size_in_bytes(nr_entries);
-	memset(bitset, 0, s);
+	bool r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iot->lock, flags);
+	r = __iot_idle_for(iot, jifs);
+	spin_unlock_irqrestore(&iot->lock, flags);
+
+	return r;
 }
 
-static void free_bitset(unsigned long *bits)
+static void iot_io_begin(struct io_tracker *iot, sector_t len)
 {
-	vfree(bits);
+	unsigned long flags;
+
+	spin_lock_irqsave(&iot->lock, flags);
+	iot->in_flight += len;
+	spin_unlock_irqrestore(&iot->lock, flags);
+}
+
+static void __iot_io_end(struct io_tracker *iot, sector_t len)
+{
+	iot->in_flight -= len;
+	if (!iot->in_flight)
+		iot->idle_time = jiffies;
+}
+
+static void iot_io_end(struct io_tracker *iot, sector_t len)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&iot->lock, flags);
+	__iot_io_end(iot, len);
+	spin_unlock_irqrestore(&iot->lock, flags);
 }
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ *	      either direction
+ */
+
+/*----------------------------------------------------------------*/
+
 /*
  * There are a couple of places where we let a bio run, but want to do some
  * work before calling its endio function.  We do this by temporarily
@@ -101,12 +150,10 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 
-/*
- * FIXME: the cache is read/write for the time being.
- */
 enum cache_metadata_mode {
 	CM_WRITE,		/* metadata may be changed */
 	CM_READ_ONLY,		/* metadata may not be changed */
+	CM_FAIL
 };
 
 enum cache_io_mode {
@@ -208,6 +255,7 @@ struct cache {
 	int sectors_per_block_shift;
 
 	spinlock_t lock;
+	struct list_head deferred_cells;
 	struct bio_list deferred_bios;
 	struct bio_list deferred_flush_bios;
 	struct bio_list deferred_writethrough_bios;
@@ -282,6 +330,8 @@ struct cache {
 	 */
 	spinlock_t invalidation_lock;
 	struct list_head invalidation_requests;
+
+	struct io_tracker origin_tracker;
 };
 
 struct per_bio_data {
@@ -289,6 +339,7 @@ struct per_bio_data {
 	unsigned req_nr:2;
 	struct dm_deferred_entry *all_io_entry;
 	struct dm_hook_info hook_info;
+	sector_t len;
 
 	/*
 	 * writethrough fields.  These MUST remain at the end of this
@@ -332,6 +383,8 @@ struct prealloc {
 	struct dm_bio_prison_cell *cell2;
 };
 
+static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+
 static void wake_worker(struct cache *cache)
 {
 	queue_work(cache->wq, &cache->worker);
@@ -365,10 +418,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
 
 static void free_migration(struct dm_cache_migration *mg)
 {
-	if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
-		wake_up(&mg->cache->migration_wait);
+	struct cache *cache = mg->cache;
+
+	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
+		wake_up(&cache->migration_wait);
 
-	mempool_free(mg, mg->cache->migration_pool);
+	mempool_free(mg, cache->migration_pool);
+	wake_worker(cache);
 }
 
 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
@@ -643,6 +699,9 @@ static void save_stats(struct cache *cache)
 {
 	struct dm_cache_statistics stats;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return;
+
 	stats.read_hits = atomic_read(&cache->stats.read_hit);
 	stats.read_misses = atomic_read(&cache->stats.read_miss);
 	stats.write_hits = atomic_read(&cache->stats.write_hit);
@@ -695,6 +754,7 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
 	pb->tick = false;
 	pb->req_nr = dm_bio_get_target_bio_nr(bio);
 	pb->all_io_entry = NULL;
+	pb->len = 0;
 
 	return pb;
 }
@@ -792,12 +852,43 @@ static void inc_ds(struct cache *cache, struct bio *bio,
 	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 }
 
+static bool accountable_bio(struct cache *cache, struct bio *bio)
+{
+	return ((bio->bi_bdev == cache->origin_dev->bdev) &&
+		!(bio->bi_rw & REQ_DISCARD));
+}
+
+static void accounted_begin(struct cache *cache, struct bio *bio)
+{
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	if (accountable_bio(cache, bio)) {
+		pb->len = bio_sectors(bio);
+		iot_io_begin(&cache->origin_tracker, pb->len);
+	}
+}
+
+static void accounted_complete(struct cache *cache, struct bio *bio)
+{
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	iot_io_end(&cache->origin_tracker, pb->len);
+}
+
+static void accounted_request(struct cache *cache, struct bio *bio)
+{
+	accounted_begin(cache, bio);
+	generic_make_request(bio);
+}
+
 static void issue(struct cache *cache, struct bio *bio)
 {
 	unsigned long flags;
 
 	if (!bio_triggers_commit(cache, bio)) {
-		generic_make_request(bio);
+		accounted_request(cache, bio);
 		return;
 	}
 
@@ -869,6 +960,94 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
 	remap_to_origin_clear_discard(pb->cache, bio, oblock);
 }
 
+/*----------------------------------------------------------------
+ * Failure modes
+ *--------------------------------------------------------------*/
+static enum cache_metadata_mode get_cache_mode(struct cache *cache)
+{
+	return cache->features.mode;
+}
+
+static const char *cache_device_name(struct cache *cache)
+{
+	return dm_device_name(dm_table_get_md(cache->ti->table));
+}
+
+static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
+{
+	const char *descs[] = {
+		"write",
+		"read-only",
+		"fail"
+	};
+
+	dm_table_event(cache->ti->table);
+	DMINFO("%s: switching cache to %s mode",
+	       cache_device_name(cache), descs[(int)mode]);
+}
+
+static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
+{
+	bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
+	enum cache_metadata_mode old_mode = get_cache_mode(cache);
+
+	if (new_mode == CM_WRITE && needs_check) {
+		DMERR("%s: unable to switch cache to write mode until repaired.",
+		      cache_device_name(cache));
+		if (old_mode != new_mode)
+			new_mode = old_mode;
+		else
+			new_mode = CM_READ_ONLY;
+	}
+
+	/* Never move out of fail mode */
+	if (old_mode == CM_FAIL)
+		new_mode = CM_FAIL;
+
+	switch (new_mode) {
+	case CM_FAIL:
+	case CM_READ_ONLY:
+		dm_cache_metadata_set_read_only(cache->cmd);
+		break;
+
+	case CM_WRITE:
+		dm_cache_metadata_set_read_write(cache->cmd);
+		break;
+	}
+
+	cache->features.mode = new_mode;
+
+	if (new_mode != old_mode)
+		notify_mode_switch(cache, new_mode);
+}
+
+static void abort_transaction(struct cache *cache)
+{
+	const char *dev_name = cache_device_name(cache);
+
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return;
+
+	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
+		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+		set_cache_mode(cache, CM_FAIL);
+	}
+
+	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+	if (dm_cache_metadata_abort(cache->cmd)) {
+		DMERR("%s: failed to abort metadata transaction", dev_name);
+		set_cache_mode(cache, CM_FAIL);
+	}
+}
+
+static void metadata_operation_failed(struct cache *cache, const char *op, int r)
+{
+	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
+		    cache_device_name(cache), op, r);
+	abort_transaction(cache);
+	set_cache_mode(cache, CM_READ_ONLY);
+}
+
 /*----------------------------------------------------------------
  * Migration processing
  *
@@ -885,26 +1064,63 @@ static void dec_io_migrations(struct cache *cache)
 	atomic_dec(&cache->nr_io_migrations);
 }
 
-static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
-			 bool holder)
+static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
+			   bool holder, struct bio_list *bios)
 {
 	(holder ? dm_cell_release : dm_cell_release_no_holder)
-		(cache->prison, cell, &cache->deferred_bios);
+		(cache->prison, cell, bios);
 	free_prison_cell(cache, cell);
 }
 
-static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
-		       bool holder)
+static bool discard_or_flush(struct bio *bio)
+{
+	return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+	if (discard_or_flush(cell->holder))
+		/*
+		 * We have to handle these bios
+		 * individually.
+		 */
+		__cell_release(cache, cell, true, &cache->deferred_bios);
+
+	else
+		list_add_tail(&cell->user_list, &cache->deferred_cells);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
 {
 	unsigned long flags;
 
+	if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
+		/*
+		 * There was no prisoner to promote to holder, the
+		 * cell has been released.
+		 */
+		free_prison_cell(cache, cell);
+		return;
+	}
+
 	spin_lock_irqsave(&cache->lock, flags);
-	__cell_defer(cache, cell, holder);
+	__cell_defer(cache, cell);
 	spin_unlock_irqrestore(&cache->lock, flags);
 
 	wake_worker(cache);
 }
 
+static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
+{
+	dm_cell_error(cache->prison, cell, err);
+	dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+	cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+}
+
 static void free_io_migration(struct dm_cache_migration *mg)
 {
 	dec_io_migrations(mg->cache);
@@ -914,21 +1130,22 @@ static void free_io_migration(struct dm_cache_migration *mg)
 static void migration_failure(struct dm_cache_migration *mg)
 {
 	struct cache *cache = mg->cache;
+	const char *dev_name = cache_device_name(cache);
 
 	if (mg->writeback) {
-		DMWARN_LIMIT("writeback failed; couldn't copy block");
+		DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
 		set_dirty(cache, mg->old_oblock, mg->cblock);
 		cell_defer(cache, mg->old_ocell, false);
 
 	} else if (mg->demote) {
-		DMWARN_LIMIT("demotion failed; couldn't copy block");
+		DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
 		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
 
 		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
 		if (mg->promote)
 			cell_defer(cache, mg->new_ocell, true);
 	} else {
-		DMWARN_LIMIT("promotion failed; couldn't copy block");
+		DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
 		policy_remove_mapping(cache->policy, mg->new_oblock);
 		cell_defer(cache, mg->new_ocell, true);
 	}
@@ -938,6 +1155,7 @@ static void migration_failure(struct dm_cache_migration *mg)
 
 static void migration_success_pre_commit(struct dm_cache_migration *mg)
 {
+	int r;
 	unsigned long flags;
 	struct cache *cache = mg->cache;
 
@@ -948,8 +1166,11 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 		return;
 
 	} else if (mg->demote) {
-		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
-			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+		r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
+		if (r) {
+			DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
+				    cache_device_name(cache));
+			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
 			policy_force_mapping(cache->policy, mg->new_oblock,
 					     mg->old_oblock);
 			if (mg->promote)
@@ -958,8 +1179,11 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 			return;
 		}
 	} else {
-		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
-			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+		r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
+		if (r) {
+			DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
+				    cache_device_name(cache));
+			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
 			policy_remove_mapping(cache->policy, mg->new_oblock);
 			free_io_migration(mg);
 			return;
@@ -978,7 +1202,8 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
 	struct cache *cache = mg->cache;
 
 	if (mg->writeback) {
-		DMWARN("writeback unexpectedly triggered commit");
+		DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
+			     cache_device_name(cache));
 		return;
 
 	} else if (mg->demote) {
@@ -1054,7 +1279,7 @@ static void issue_copy(struct dm_cache_migration *mg)
 	}
 
 	if (r < 0) {
-		DMERR_LIMIT("issuing migration failed");
+		DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
 		migration_failure(mg);
 	}
 }
@@ -1093,7 +1318,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
 	 * No need to inc_ds() here, since the cell will be held for the
 	 * duration of the io.
 	 */
-	generic_make_request(bio);
+	accounted_request(mg->cache, bio);
 }
 
 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
@@ -1439,32 +1664,154 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
 		   &cache->stats.read_miss : &cache->stats.write_miss);
 }
 
-static void process_bio(struct cache *cache, struct prealloc *structs,
-			struct bio *bio)
+/*----------------------------------------------------------------*/
+
+struct inc_detail {
+	struct cache *cache;
+	struct bio_list bios_for_issue;
+	struct bio_list unhandled_bios;
+	bool any_writes;
+};
+
+static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+{
+	struct bio *bio;
+	struct inc_detail *detail = context;
+	struct cache *cache = detail->cache;
+
+	inc_ds(cache, cell->holder, cell);
+	if (bio_data_dir(cell->holder) == WRITE)
+		detail->any_writes = true;
+
+	while ((bio = bio_list_pop(&cell->bios))) {
+		if (discard_or_flush(bio)) {
+			bio_list_add(&detail->unhandled_bios, bio);
+			continue;
+		}
+
+		if (bio_data_dir(bio) == WRITE)
+			detail->any_writes = true;
+
+		bio_list_add(&detail->bios_for_issue, bio);
+		inc_ds(cache, bio, cell);
+	}
+}
+
+/* FIXME: refactor these two */
+static void remap_cell_to_origin_clear_discard(struct cache *cache,
+					       struct dm_bio_prison_cell *cell,
+					       dm_oblock_t oblock, bool issue_holder)
+{
+	struct bio *bio;
+	unsigned long flags;
+	struct inc_detail detail;
+
+	detail.cache = cache;
+	bio_list_init(&detail.bios_for_issue);
+	bio_list_init(&detail.unhandled_bios);
+	detail.any_writes = false;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	remap_to_origin(cache, cell->holder);
+	if (issue_holder)
+		issue(cache, cell->holder);
+	else
+		accounted_begin(cache, cell->holder);
+
+	if (detail.any_writes)
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
+
+	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+		remap_to_origin(cache, bio);
+		issue(cache, bio);
+	}
+}
+
+static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
+				      dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+{
+	struct bio *bio;
+	unsigned long flags;
+	struct inc_detail detail;
+
+	detail.cache = cache;
+	bio_list_init(&detail.bios_for_issue);
+	bio_list_init(&detail.unhandled_bios);
+	detail.any_writes = false;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	remap_to_cache(cache, cell->holder, cblock);
+	if (issue_holder)
+		issue(cache, cell->holder);
+	else
+		accounted_begin(cache, cell->holder);
+
+	if (detail.any_writes) {
+		set_dirty(cache, oblock, cblock);
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
+	}
+
+	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+		remap_to_cache(cache, bio, cblock);
+		issue(cache, bio);
+	}
+}
+
+/*----------------------------------------------------------------*/
+
+struct old_oblock_lock {
+	struct policy_locker locker;
+	struct cache *cache;
+	struct prealloc *structs;
+	struct dm_bio_prison_cell *cell;
+};
+
+static int null_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+	/* This should never be called */
+	BUG();
+	return 0;
+}
+
+static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+	struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
+	struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
+
+	return bio_detain(l->cache, b, NULL, cell_prealloc,
+			  (cell_free_fn) prealloc_put_cell,
+			  l->structs, &l->cell);
+}
+
+static void process_cell(struct cache *cache, struct prealloc *structs,
+			 struct dm_bio_prison_cell *new_ocell)
 {
 	int r;
 	bool release_cell = true;
+	struct bio *bio = new_ocell->holder;
 	dm_oblock_t block = get_bio_block(cache, bio);
-	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
 	struct policy_result lookup_result;
 	bool passthrough = passthrough_mode(&cache->features);
-	bool discarded_block, can_migrate;
-
-	/*
-	 * Check to see if that block is currently migrating.
-	 */
-	cell_prealloc = prealloc_get_cell(structs);
-	r = bio_detain(cache, block, bio, cell_prealloc,
-		       (cell_free_fn) prealloc_put_cell,
-		       structs, &new_ocell);
-	if (r > 0)
-		return;
+	bool fast_promotion, can_migrate;
+	struct old_oblock_lock ool;
 
-	discarded_block = is_discarded_oblock(cache, block);
-	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+	fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
+	can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
 
-	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
-		       bio, &lookup_result);
+	ool.locker.fn = cell_locker;
+	ool.cache = cache;
+	ool.structs = structs;
+	ool.cell = NULL;
+	r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
+		       bio, &ool.locker, &lookup_result);
 
 	if (r == -EWOULDBLOCK)
 		/* migration has been denied */
@@ -1500,9 +1847,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
 				inc_and_issue(cache, bio, new_ocell);
 
-			} else  {
-				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-				inc_and_issue(cache, bio, new_ocell);
+			} else {
+				remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
+				release_cell = false;
 			}
 		}
 
@@ -1510,8 +1857,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 
 	case POLICY_MISS:
 		inc_miss_counter(cache, bio);
-		remap_to_origin_clear_discard(cache, bio, block);
-		inc_and_issue(cache, bio, new_ocell);
+		remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
+		release_cell = false;
 		break;
 
 	case POLICY_NEW:
@@ -1521,32 +1868,17 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 		break;
 
 	case POLICY_REPLACE:
-		cell_prealloc = prealloc_get_cell(structs);
-		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
-			       (cell_free_fn) prealloc_put_cell,
-			       structs, &old_ocell);
-		if (r > 0) {
-			/*
-			 * We have to be careful to avoid lock inversion of
-			 * the cells.  So we back off, and wait for the
-			 * old_ocell to become free.
-			 */
-			policy_force_mapping(cache->policy, block,
-					     lookup_result.old_oblock);
-			atomic_inc(&cache->stats.cache_cell_clash);
-			break;
-		}
 		atomic_inc(&cache->stats.demotion);
 		atomic_inc(&cache->stats.promotion);
-
 		demote_then_promote(cache, structs, lookup_result.old_oblock,
 				    block, lookup_result.cblock,
-				    old_ocell, new_ocell);
+				    ool.cell, new_ocell);
 		release_cell = false;
 		break;
 
 	default:
-		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+		DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
+			    cache_device_name(cache), __func__,
 			    (unsigned) lookup_result.op);
 		bio_io_error(bio);
 	}
@@ -1555,10 +1887,48 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 		cell_defer(cache, new_ocell, false);
 }
 
+static void process_bio(struct cache *cache, struct prealloc *structs,
+			struct bio *bio)
+{
+	int r;
+	dm_oblock_t block = get_bio_block(cache, bio);
+	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+
+	/*
+	 * Check to see if that block is currently migrating.
+	 */
+	cell_prealloc = prealloc_get_cell(structs);
+	r = bio_detain(cache, block, bio, cell_prealloc,
+		       (cell_free_fn) prealloc_put_cell,
+		       structs, &new_ocell);
+	if (r > 0)
+		return;
+
+	process_cell(cache, structs, new_ocell);
+}
+
 static int need_commit_due_to_time(struct cache *cache)
 {
-	return !time_in_range(jiffies, cache->last_commit_jiffies,
-			      cache->last_commit_jiffies + COMMIT_PERIOD);
+	return jiffies < cache->last_commit_jiffies ||
+	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+}
+
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ */
+static int commit(struct cache *cache, bool clean_shutdown)
+{
+	int r;
+
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
+	atomic_inc(&cache->stats.commit_count);
+	r = dm_cache_commit(cache->cmd, clean_shutdown);
+	if (r)
+		metadata_operation_failed(cache, "dm_cache_commit", r);
+
+	return r;
 }
 
 static int commit_if_needed(struct cache *cache)
@@ -1567,9 +1937,8 @@ static int commit_if_needed(struct cache *cache)
 
 	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
 	    dm_cache_changed_this_transaction(cache->cmd)) {
-		atomic_inc(&cache->stats.commit_count);
+		r = commit(cache, false);
 		cache->commit_requested = false;
-		r = dm_cache_commit(cache->cmd, false);
 		cache->last_commit_jiffies = jiffies;
 	}
 
@@ -1617,6 +1986,40 @@ static void process_deferred_bios(struct cache *cache)
 	prealloc_free_structs(cache, &structs);
 }
 
+static void process_deferred_cells(struct cache *cache)
+{
+	unsigned long flags;
+	struct dm_bio_prison_cell *cell, *tmp;
+	struct list_head cells;
+	struct prealloc structs;
+
+	memset(&structs, 0, sizeof(structs));
+
+	INIT_LIST_HEAD(&cells);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_splice_init(&cache->deferred_cells, &cells);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	list_for_each_entry_safe(cell, tmp, &cells, user_list) {
+		/*
+		 * If we've got no free migration structs, and processing
+		 * this bio might require one, we pause until there are some
+		 * prepared mappings to process.
+		 */
+		if (prealloc_data_structs(cache, &structs)) {
+			spin_lock_irqsave(&cache->lock, flags);
+			list_splice(&cells, &cache->deferred_cells);
+			spin_unlock_irqrestore(&cache->lock, flags);
+			break;
+		}
+
+		process_cell(cache, &structs, cell);
+	}
+
+	prealloc_free_structs(cache, &structs);
+}
+
 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
 {
 	unsigned long flags;
@@ -1634,7 +2037,7 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
 	 * These bios have already been through inc_ds()
 	 */
 	while ((bio = bio_list_pop(&bios)))
-		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
+		submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
 }
 
 static void process_deferred_writethrough_bios(struct cache *cache)
@@ -1654,7 +2057,7 @@ static void process_deferred_writethrough_bios(struct cache *cache)
 	 * These bios have already been through inc_ds()
 	 */
 	while ((bio = bio_list_pop(&bios)))
-		generic_make_request(bio);
+		accounted_request(cache, bio);
 }
 
 static void writeback_some_dirty_blocks(struct cache *cache)
@@ -1664,6 +2067,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 	dm_cblock_t cblock;
 	struct prealloc structs;
 	struct dm_bio_prison_cell *old_ocell;
+	bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
 
 	memset(&structs, 0, sizeof(structs));
 
@@ -1671,7 +2075,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 		if (prealloc_data_structs(cache, &structs))
 			break;
 
-		r = policy_writeback_work(cache->policy, &oblock, &cblock);
+		r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
 		if (r)
 			break;
 
@@ -1702,15 +2106,17 @@ static void process_invalidation_request(struct cache *cache, struct invalidatio
 		r = policy_remove_cblock(cache->policy, to_cblock(begin));
 		if (!r) {
 			r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
-			if (r)
+			if (r) {
+				metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
 				break;
+			}
 
 		} else if (r == -ENODATA) {
 			/* harmless, already unmapped */
 			r = 0;
 
 		} else {
-			DMERR("policy_remove_cblock failed");
+			DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
 			break;
 		}
 
@@ -1783,7 +2189,22 @@ static void stop_worker(struct cache *cache)
 	flush_workqueue(cache->wq);
 }
 
-static void requeue_deferred_io(struct cache *cache)
+static void requeue_deferred_cells(struct cache *cache)
+{
+	unsigned long flags;
+	struct list_head cells;
+	struct dm_bio_prison_cell *cell, *tmp;
+
+	INIT_LIST_HEAD(&cells);
+	spin_lock_irqsave(&cache->lock, flags);
+	list_splice_init(&cache->deferred_cells, &cells);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	list_for_each_entry_safe(cell, tmp, &cells, user_list)
+		cell_requeue(cache, cell);
+}
+
+static void requeue_deferred_bios(struct cache *cache)
 {
 	struct bio *bio;
 	struct bio_list bios;
@@ -1804,6 +2225,7 @@ static int more_work(struct cache *cache)
 			!list_empty(&cache->need_commit_migrations);
 	else
 		return !bio_list_empty(&cache->deferred_bios) ||
+			!list_empty(&cache->deferred_cells) ||
 			!bio_list_empty(&cache->deferred_flush_bios) ||
 			!bio_list_empty(&cache->deferred_writethrough_bios) ||
 			!list_empty(&cache->quiesced_migrations) ||
@@ -1821,6 +2243,7 @@ static void do_worker(struct work_struct *ws)
 			writeback_some_dirty_blocks(cache);
 			process_deferred_writethrough_bios(cache);
 			process_deferred_bios(cache);
+			process_deferred_cells(cache);
 			process_invalidation_requests(cache);
 		}
 
@@ -1830,11 +2253,6 @@ static void do_worker(struct work_struct *ws)
 		if (commit_if_needed(cache)) {
 			process_deferred_flush_bios(cache, false);
 			process_migrations(cache, &cache->need_commit_migrations, migration_failure);
-
-			/*
-			 * FIXME: rollback metadata or just go into a
-			 * failure mode and error everything
-			 */
 		} else {
 			process_deferred_flush_bios(cache, true);
 			process_migrations(cache, &cache->need_commit_migrations,
@@ -1853,7 +2271,7 @@ static void do_worker(struct work_struct *ws)
 static void do_waker(struct work_struct *ws)
 {
 	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
-	policy_tick(cache->policy);
+	policy_tick(cache->policy, true);
 	wake_worker(cache);
 	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
 }
@@ -2407,6 +2825,12 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 		goto bad;
 	}
 	cache->cmd = cmd;
+	set_cache_mode(cache, CM_WRITE);
+	if (get_cache_mode(cache) != CM_WRITE) {
+		*error = "Unable to get write access to metadata, please check/repair metadata.";
+		r = -EINVAL;
+		goto bad;
+	}
 
 	if (passthrough_mode(&cache->features)) {
 		bool all_clean;
@@ -2425,6 +2849,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	}
 
 	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->deferred_cells);
 	bio_list_init(&cache->deferred_bios);
 	bio_list_init(&cache->deferred_flush_bios);
 	bio_list_init(&cache->deferred_writethrough_bios);
@@ -2514,6 +2939,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	spin_lock_init(&cache->invalidation_lock);
 	INIT_LIST_HEAD(&cache->invalidation_requests);
 
+	iot_init(&cache->origin_tracker);
+
 	*result = cache;
 	return 0;
 
@@ -2580,15 +3007,23 @@ out:
 	return r;
 }
 
-static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
+/*----------------------------------------------------------------*/
+
+static int cache_map(struct dm_target *ti, struct bio *bio)
 {
+	struct cache *cache = ti->private;
+
 	int r;
+	struct dm_bio_prison_cell *cell = NULL;
 	dm_oblock_t block = get_bio_block(cache, bio);
 	size_t pb_data_size = get_per_bio_data_size(cache);
 	bool can_migrate = false;
-	bool discarded_block;
+	bool fast_promotion;
 	struct policy_result lookup_result;
 	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
+	struct old_oblock_lock ool;
+
+	ool.locker.fn = null_locker;
 
 	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
 		/*
@@ -2597,10 +3032,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 		 * Just remap to the origin and carry on.
 		 */
 		remap_to_origin(cache, bio);
+		accounted_begin(cache, bio);
 		return DM_MAPIO_REMAPPED;
 	}
 
-	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
+	if (discard_or_flush(bio)) {
 		defer_bio(cache, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -2608,15 +3044,15 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	/*
 	 * Check to see if that block is currently migrating.
 	 */
-	*cell = alloc_prison_cell(cache);
-	if (!*cell) {
+	cell = alloc_prison_cell(cache);
+	if (!cell) {
 		defer_bio(cache, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	r = bio_detain(cache, block, bio, *cell,
+	r = bio_detain(cache, block, bio, cell,
 		       (cell_free_fn) free_prison_cell,
-		       cache, cell);
+		       cache, &cell);
 	if (r) {
 		if (r < 0)
 			defer_bio(cache, bio);
@@ -2624,17 +3060,18 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	discarded_block = is_discarded_oblock(cache, block);
+	fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
 
-	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
-		       bio, &lookup_result);
+	r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
+		       bio, &ool.locker, &lookup_result);
 	if (r == -EWOULDBLOCK) {
-		cell_defer(cache, *cell, true);
+		cell_defer(cache, cell, true);
 		return DM_MAPIO_SUBMITTED;
 
 	} else if (r) {
-		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
-		cell_defer(cache, *cell, false);
+		DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
+			    cache_device_name(cache), r);
+		cell_defer(cache, cell, false);
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -2648,21 +3085,30 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 				 * We need to invalidate this block, so
 				 * defer for the worker thread.
 				 */
-				cell_defer(cache, *cell, true);
+				cell_defer(cache, cell, true);
 				r = DM_MAPIO_SUBMITTED;
 
 			} else {
 				inc_miss_counter(cache, bio);
 				remap_to_origin_clear_discard(cache, bio, block);
+				accounted_begin(cache, bio);
+				inc_ds(cache, bio, cell);
+				/* FIXME: we want to remap hits or misses straight
+				 * away rather than passing over to the worker. */
+				cell_defer(cache, cell, false);
 			}
 
 		} else {
 			inc_hit_counter(cache, bio);
 			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
-			    !is_dirty(cache, lookup_result.cblock))
+			    !is_dirty(cache, lookup_result.cblock)) {
 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
-			else
-				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+				accounted_begin(cache, bio);
+				inc_ds(cache, bio, cell);
+				cell_defer(cache, cell, false);
+
+			} else
+				remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
 		}
 		break;
 
@@ -2674,18 +3120,19 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 			 * longer needed because the block has been demoted.
 			 */
 			bio_endio(bio, 0);
-			cell_defer(cache, *cell, false);
+			/* FIXME: remap everything as a miss */
+			cell_defer(cache, cell, false);
 			r = DM_MAPIO_SUBMITTED;
 
 		} else
-			remap_to_origin_clear_discard(cache, bio, block);
-
+			remap_cell_to_origin_clear_discard(cache, cell, block, false);
 		break;
 
 	default:
-		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
+		DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
+			    cache_device_name(cache), __func__,
 			    (unsigned) lookup_result.op);
-		cell_defer(cache, *cell, false);
+		cell_defer(cache, cell, false);
 		bio_io_error(bio);
 		r = DM_MAPIO_SUBMITTED;
 	}
@@ -2693,21 +3140,6 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	return r;
 }
 
-static int cache_map(struct dm_target *ti, struct bio *bio)
-{
-	int r;
-	struct dm_bio_prison_cell *cell = NULL;
-	struct cache *cache = ti->private;
-
-	r = __cache_map(cache, bio, &cell);
-	if (r == DM_MAPIO_REMAPPED && cell) {
-		inc_ds(cache, bio, cell);
-		cell_defer(cache, cell, false);
-	}
-
-	return r;
-}
-
 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
 	struct cache *cache = ti->private;
@@ -2716,7 +3148,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 
 	if (pb->tick) {
-		policy_tick(cache->policy);
+		policy_tick(cache->policy, false);
 
 		spin_lock_irqsave(&cache->lock, flags);
 		cache->need_tick_bio = true;
@@ -2724,6 +3156,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 	}
 
 	check_for_quiesced_migrations(cache, pb);
+	accounted_complete(cache, bio);
 
 	return 0;
 }
@@ -2732,11 +3165,16 @@ static int write_dirty_bitset(struct cache *cache)
 {
 	unsigned i, r;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
 	for (i = 0; i < from_cblock(cache->cache_size); i++) {
 		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
 				       is_dirty(cache, to_cblock(i)));
-		if (r)
+		if (r) {
+			metadata_operation_failed(cache, "dm_cache_set_dirty", r);
 			return r;
+		}
 	}
 
 	return 0;
@@ -2746,18 +3184,40 @@ static int write_discard_bitset(struct cache *cache)
 {
 	unsigned i, r;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
 					   cache->discard_nr_blocks);
 	if (r) {
-		DMERR("could not resize on-disk discard bitset");
+		DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
+		metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
 		return r;
 	}
 
 	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
 		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
 					 is_discarded(cache, to_dblock(i)));
-		if (r)
+		if (r) {
+			metadata_operation_failed(cache, "dm_cache_set_discard", r);
 			return r;
+		}
+	}
+
+	return 0;
+}
+
+static int write_hints(struct cache *cache)
+{
+	int r;
+
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
+	r = dm_cache_write_hints(cache->cmd, cache->policy);
+	if (r) {
+		metadata_operation_failed(cache, "dm_cache_write_hints", r);
+		return r;
 	}
 
 	return 0;
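
write_dirty_bitset(), write_discard_bitset() and the new write_hints() above all follow the same shape introduced by the fail-io/needs_check work: refuse to touch metadata once the cache has been degraded past CM_WRITE, and funnel every failure through metadata_operation_failed() so the degradation is decided in one place. A minimal sketch of that shape follows; the toy_* names, the errno choice and the exact degradation policy are illustrative assumptions, not the dm-cache code.

#include <errno.h>

/* ordering matters: the checks below rely on CM_WRITE < CM_READ_ONLY < CM_FAIL */
enum cache_mode { CM_WRITE, CM_READ_ONLY, CM_FAIL };

struct toy_cache {
	enum cache_mode mode;
};

/* degrade once; the real target also logs and raises a needs_check flag */
static void toy_metadata_operation_failed(struct toy_cache *c, const char *op, int r)
{
	(void)op;
	(void)r;
	if (c->mode == CM_WRITE)
		c->mode = CM_READ_ONLY;
}

static int toy_write_metadata(struct toy_cache *c, const char *name,
			      int (*op)(void *), void *ctx)
{
	int r;

	if (c->mode >= CM_READ_ONLY)	/* metadata is off limits once degraded */
		return -EINVAL;

	r = op(ctx);
	if (r)
		toy_metadata_operation_failed(c, name, r);
	return r;
}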
@@ -2772,26 +3232,26 @@ static bool sync_metadata(struct cache *cache)
 
 	r1 = write_dirty_bitset(cache);
 	if (r1)
-		DMERR("could not write dirty bitset");
+		DMERR("%s: could not write dirty bitset", cache_device_name(cache));
 
 	r2 = write_discard_bitset(cache);
 	if (r2)
-		DMERR("could not write discard bitset");
+		DMERR("%s: could not write discard bitset", cache_device_name(cache));
 
 	save_stats(cache);
 
-	r3 = dm_cache_write_hints(cache->cmd, cache->policy);
+	r3 = write_hints(cache);
 	if (r3)
-		DMERR("could not write hints");
+		DMERR("%s: could not write hints", cache_device_name(cache));
 
 	/*
 	 * If writing the above metadata failed, we still commit, but don't
 	 * set the clean shutdown flag.  This will effectively force every
 	 * dirty bit to be set on reload.
 	 */
-	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
+	r4 = commit(cache, !r1 && !r2 && !r3);
 	if (r4)
-		DMERR("could not write cache metadata.  Data loss may occur.");
+		DMERR("%s: could not write cache metadata", cache_device_name(cache));
 
 	return !r1 && !r2 && !r3 && !r4;
 }
@@ -2803,10 +3263,12 @@ static void cache_postsuspend(struct dm_target *ti)
 	start_quiescing(cache);
 	wait_for_migrations(cache);
 	stop_worker(cache);
-	requeue_deferred_io(cache);
+	requeue_deferred_bios(cache);
+	requeue_deferred_cells(cache);
 	stop_quiescing(cache);
 
-	(void) sync_metadata(cache);
+	if (get_cache_mode(cache) == CM_WRITE)
+		(void) sync_metadata(cache);
 }
 
 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
@@ -2929,7 +3391,8 @@ static bool can_resize(struct cache *cache, dm_cblock_t new_size)
 	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
 		new_size = to_cblock(from_cblock(new_size) + 1);
 		if (is_dirty(cache, new_size)) {
-			DMERR("unable to shrink cache; cache block %llu is dirty",
+			DMERR("%s: unable to shrink cache; cache block %llu is dirty",
+			      cache_device_name(cache),
 			      (unsigned long long) from_cblock(new_size));
 			return false;
 		}
@@ -2944,7 +3407,8 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 
 	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
-		DMERR("could not resize cache metadata");
+		DMERR("%s: could not resize cache metadata", cache_device_name(cache));
+		metadata_operation_failed(cache, "dm_cache_resize", r);
 		return r;
 	}
 
@@ -2982,7 +3446,8 @@ static int cache_preresume(struct dm_target *ti)
 		r = dm_cache_load_mappings(cache->cmd, cache->policy,
 					   load_mapping, cache);
 		if (r) {
-			DMERR("could not load cache mappings");
+			DMERR("%s: could not load cache mappings", cache_device_name(cache));
+			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
 			return r;
 		}
 
@@ -3002,7 +3467,8 @@ static int cache_preresume(struct dm_target *ti)
 		discard_load_info_init(cache, &li);
 		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
 		if (r) {
-			DMERR("could not load origin discards");
+			DMERR("%s: could not load origin discards", cache_device_name(cache));
+			metadata_operation_failed(cache, "dm_cache_load_discards", r);
 			return r;
 		}
 		set_discard_range(&li);
@@ -3030,7 +3496,7 @@ static void cache_resume(struct dm_target *ti)
  * <#demotions> <#promotions> <#dirty>
  * <#features> <features>*
  * <#core args> <core args>
- * <policy name> <#policy args> <policy args>*
+ * <policy name> <#policy args> <policy args>* <cache metadata mode>
  */
 static void cache_status(struct dm_target *ti, status_type_t type,
 			 unsigned status_flags, char *result, unsigned maxlen)
@@ -3046,23 +3512,26 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 
 	switch (type) {
 	case STATUSTYPE_INFO:
-		/* Commit to ensure statistics aren't out-of-date */
-		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
-			r = dm_cache_commit(cache->cmd, false);
-			if (r)
-				DMERR("could not commit metadata for accurate status");
+		if (get_cache_mode(cache) == CM_FAIL) {
+			DMEMIT("Fail");
+			break;
 		}
 
-		r = dm_cache_get_free_metadata_block_count(cache->cmd,
-							   &nr_free_blocks_metadata);
+		/* Commit to ensure statistics aren't out-of-date */
+		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+			(void) commit(cache, false);
+
+		r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
 		if (r) {
-			DMERR("could not get metadata free block count");
+			DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
+			      cache_device_name(cache), r);
 			goto err;
 		}
 
 		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
 		if (r) {
-			DMERR("could not get metadata device size");
+			DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
+			      cache_device_name(cache), r);
 			goto err;
 		}
 
@@ -3093,7 +3562,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 			DMEMIT("1 writeback ");
 
 		else {
-			DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+			DMERR("%s: internal error: unknown io mode: %d",
+			      cache_device_name(cache), (int) cache->features.io_mode);
 			goto err;
 		}
 
@@ -3101,11 +3571,17 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 
 		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
 		if (sz < maxlen) {
-			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
+			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
 			if (r)
-				DMERR("policy_emit_config_values returned %d", r);
+				DMERR("%s: policy_emit_config_values returned %d",
+				      cache_device_name(cache), r);
 		}
 
+		if (get_cache_mode(cache) == CM_READ_ONLY)
+			DMEMIT("ro ");
+		else
+			DMEMIT("rw ");
+
 		break;
 
 	case STATUSTYPE_TABLE:
@@ -3167,7 +3643,7 @@ static int parse_cblock_range(struct cache *cache, const char *str,
 		return 0;
 	}
 
-	DMERR("invalid cblock range '%s'", str);
+	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
 	return -EINVAL;
 }
 
@@ -3178,17 +3654,20 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
 	uint64_t n = from_cblock(cache->cache_size);
 
 	if (b >= n) {
-		DMERR("begin cblock out of range: %llu >= %llu", b, n);
+		DMERR("%s: begin cblock out of range: %llu >= %llu",
+		      cache_device_name(cache), b, n);
 		return -EINVAL;
 	}
 
 	if (e > n) {
-		DMERR("end cblock out of range: %llu > %llu", e, n);
+		DMERR("%s: end cblock out of range: %llu > %llu",
+		      cache_device_name(cache), e, n);
 		return -EINVAL;
 	}
 
 	if (b >= e) {
-		DMERR("invalid cblock range: %llu >= %llu", b, e);
+		DMERR("%s: invalid cblock range: %llu >= %llu",
+		      cache_device_name(cache), b, e);
 		return -EINVAL;
 	}
 
@@ -3222,7 +3701,8 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun
 	struct cblock_range range;
 
 	if (!passthrough_mode(&cache->features)) {
-		DMERR("cache has to be in passthrough mode for invalidation");
+		DMERR("%s: cache has to be in passthrough mode for invalidation",
+		      cache_device_name(cache));
 		return -EPERM;
 	}
 
@@ -3261,6 +3741,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (!argc)
 		return -EINVAL;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY) {
+		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
+		      cache_device_name(cache));
+		return -EOPNOTSUPP;
+	}
+
 	if (!strcasecmp(argv[0], "invalidate_cblocks"))
 		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
 
@@ -3334,7 +3820,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
 	.name = "cache",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr = cache_ctr,
 	.dtr = cache_dtr,

+ 24 - 6
drivers/md/dm-crypt.c

@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 Jana Saout <jana@saout.de>
  * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved.
  * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
  *
  * This file is released under the GPL.
@@ -891,6 +891,11 @@ static void crypt_alloc_req(struct crypt_config *cc,
 		ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
 	ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+
+	/*
+	 * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
+	 * requests if driver request queue is full.
+	 */
 	ablkcipher_request_set_callback(ctx->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
@@ -924,24 +929,32 @@ static int crypt_convert(struct crypt_config *cc,
 		r = crypt_convert_block(cc, ctx, ctx->req);
 
 		switch (r) {
-		/* async */
+		/*
+		 * The request was queued by a crypto driver
+		 * but the driver request queue is full, let's wait.
+		 */
 		case -EBUSY:
 			wait_for_completion(&ctx->restart);
 			reinit_completion(&ctx->restart);
-			/* fall through*/
+			/* fall through */
+		/*
+		 * The request is queued and processed asynchronously,
+		 * completion function kcryptd_async_done() will be called.
+		 */
 		case -EINPROGRESS:
 			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
-
-		/* sync */
+		/*
+		 * The request was already processed (synchronously).
+		 */
 		case 0:
 			atomic_dec(&ctx->cc_pending);
 			ctx->cc_sector++;
 			cond_resched();
 			continue;
 
-		/* error */
+		/* There was an error while processing the request. */
 		default:
 			atomic_dec(&ctx->cc_pending);
 			return r;
@@ -1346,6 +1359,11 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
 	struct crypt_config *cc = io->cc;
 
+	/*
+	 * A request from crypto driver backlog is going to be processed now,
+	 * finish the completion and continue in crypt_convert().
+	 * (Callback will be called for the second time for this request.)
+	 */
 	if (error == -EINPROGRESS) {
 		complete(&ctx->restart);
 		return;
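
Taken together, the comments added to crypt_convert() and kcryptd_async_done() describe a small state machine: 0 means a block finished synchronously, -EINPROGRESS means the completion callback will finish it, and -EBUSY means the request sits on the cipher driver's backlog, so the submitter waits and the callback fires twice, first with -EINPROGRESS to release that wait and later with the real result. Below is a compressed userspace-style sketch of the same flow; toy_ctx, do_one_block() and the restart flag are placeholders rather than dm-crypt's names, and the busy-wait stands in for the completion the real code sleeps on.

#include <errno.h>
#include <stdbool.h>

struct toy_ctx {
	unsigned pending;	/* blocks that still owe a completion */
	bool restart;		/* set when a backlogged request is accepted */
};

/* stand-in for crypt_convert_block(); here everything completes synchronously */
static int do_one_block(struct toy_ctx *ctx)
{
	(void)ctx;
	return 0;
}

/* async callback: runs twice for a request that went through the backlog */
static void toy_async_done(struct toy_ctx *ctx, int error)
{
	if (error == -EINPROGRESS) {
		ctx->restart = true;	/* backlog slot freed: unblock submitter */
		return;
	}
	ctx->pending--;			/* second call carries the real result */
}

static int toy_convert(struct toy_ctx *ctx, unsigned nr_blocks)
{
	unsigned i;

	ctx->pending = nr_blocks;

	for (i = 0; i < nr_blocks; i++) {
		int r = do_one_block(ctx);

		switch (r) {
		case -EBUSY:
			while (!ctx->restart)	/* real code: wait_for_completion() */
				;
			ctx->restart = false;
			/* fall through */
		case -EINPROGRESS:
			continue;		/* toy_async_done() finishes it */
		case 0:
			ctx->pending--;		/* done synchronously */
			continue;
		default:
			return r;		/* hard error, stop converting */
		}
	}
	return 0;
}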

+ 2 - 2
drivers/md/dm-log-writes.c

@@ -55,8 +55,8 @@
 #define LOG_DISCARD_FLAG (1 << 2)
 #define LOG_MARK_FLAG (1 << 3)
 
-#define WRITE_LOG_VERSION 1
-#define WRITE_LOG_MAGIC 0x6a736677736872
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
 
 /*
  * The disk format for this is braindead simple.

+ 130 - 95
drivers/md/dm-raid.c

@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -17,6 +17,7 @@
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "raid"
+#define	MAX_RAID_DEVICES	253 /* raid4/5/6 limit */
 
 static bool devices_handle_discard_safely = false;
 
@@ -45,25 +46,25 @@ struct raid_dev {
 };
 
 /*
- * Flags for rs->print_flags field.
+ * Flags for rs->ctr_flags field.
  */
-#define DMPF_SYNC              0x1
-#define DMPF_NOSYNC            0x2
-#define DMPF_REBUILD           0x4
-#define DMPF_DAEMON_SLEEP      0x8
-#define DMPF_MIN_RECOVERY_RATE 0x10
-#define DMPF_MAX_RECOVERY_RATE 0x20
-#define DMPF_MAX_WRITE_BEHIND  0x40
-#define DMPF_STRIPE_CACHE      0x80
-#define DMPF_REGION_SIZE       0x100
-#define DMPF_RAID10_COPIES     0x200
-#define DMPF_RAID10_FORMAT     0x400
+#define CTR_FLAG_SYNC              0x1
+#define CTR_FLAG_NOSYNC            0x2
+#define CTR_FLAG_REBUILD           0x4
+#define CTR_FLAG_DAEMON_SLEEP      0x8
+#define CTR_FLAG_MIN_RECOVERY_RATE 0x10
+#define CTR_FLAG_MAX_RECOVERY_RATE 0x20
+#define CTR_FLAG_MAX_WRITE_BEHIND  0x40
+#define CTR_FLAG_STRIPE_CACHE      0x80
+#define CTR_FLAG_REGION_SIZE       0x100
+#define CTR_FLAG_RAID10_COPIES     0x200
+#define CTR_FLAG_RAID10_FORMAT     0x400
 
 struct raid_set {
 	struct dm_target *ti;
 
 	uint32_t bitmap_loaded;
-	uint32_t print_flags;
+	uint32_t ctr_flags;
 
 	struct mddev md;
 	struct raid_type *raid_type;
@@ -81,6 +82,7 @@ static struct raid_type {
 	const unsigned level;		/* RAID level. */
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
+	{"raid0",    "RAID0 (striping)",                0, 2, 0, 0 /* NONE */},
 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
 	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
@@ -119,15 +121,15 @@ static int raid10_format_to_md_layout(char *format, unsigned copies)
 {
 	unsigned n = 1, f = 1;
 
-	if (!strcmp("near", format))
+	if (!strcasecmp("near", format))
 		n = copies;
 	else
 		f = copies;
 
-	if (!strcmp("offset", format))
+	if (!strcasecmp("offset", format))
 		return 0x30000 | (f << 8) | n;
 
-	if (!strcmp("far", format))
+	if (!strcasecmp("far", format))
 		return 0x20000 | (f << 8) | n;
 
 	return (f << 8) | n;
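
raid10_format_to_md_layout() above packs the user's format into MD's raid10 layout word: the low byte carries the near-copy count, the next byte the far/offset-copy count, and the 0x20000 / 0x30000 markers distinguish "far" and "offset" from plain "near". Here is a tiny userspace decoder for eyeballing the values it produces; describe_layout() and the example copy counts are mine, only the bit packing comes from the function above.

#include <stdio.h>

static void describe_layout(int layout)
{
	unsigned n = layout & 0xff;		/* near copies */
	unsigned f = (layout >> 8) & 0xff;	/* far/offset copies */
	const char *fmt = "near";

	if ((layout & 0x30000) == 0x30000)
		fmt = "offset";
	else if (layout & 0x20000)
		fmt = "far";

	printf("format=%-6s n=%u f=%u copies=%u\n",
	       fmt, n, f, n > 1 ? n : f);
}

int main(void)
{
	describe_layout((1 << 8) | 2);			/* "near", 2 copies   */
	describe_layout(0x20000 | (2 << 8) | 1);	/* "far", 2 copies    */
	describe_layout(0x30000 | (2 << 8) | 1);	/* "offset", 2 copies */
	return 0;
}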
@@ -477,8 +479,6 @@ too_many:
  *                                      will form the "stripe"
  *    [[no]sync]			Force or prevent recovery of the
  *                                      entire array
- *    [devices_handle_discard_safely]	Allow discards on RAID4/5/6; useful if RAID
- *					member device(s) properly support TRIM/UNMAP
  *    [rebuild <idx>]			Rebuild the drive indicated by the index
  *    [daemon_sleep <ms>]		Time between bitmap daemon work to
  *                                      clear bits
@@ -555,12 +555,12 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	for (i = 0; i < num_raid_params; i++) {
 		if (!strcasecmp(argv[i], "nosync")) {
 			rs->md.recovery_cp = MaxSector;
-			rs->print_flags |= DMPF_NOSYNC;
+			rs->ctr_flags |= CTR_FLAG_NOSYNC;
 			continue;
 		}
 		if (!strcasecmp(argv[i], "sync")) {
 			rs->md.recovery_cp = 0;
-			rs->print_flags |= DMPF_SYNC;
+			rs->ctr_flags |= CTR_FLAG_SYNC;
 			continue;
 		}
 
@@ -585,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				return -EINVAL;
 			}
 			raid10_format = argv[i];
-			rs->print_flags |= DMPF_RAID10_FORMAT;
+			rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
 			continue;
 		}
 
@@ -602,7 +602,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			}
 			clear_bit(In_sync, &rs->dev[value].rdev.flags);
 			rs->dev[value].rdev.recovery_offset = 0;
-			rs->print_flags |= DMPF_REBUILD;
+			rs->ctr_flags |= CTR_FLAG_REBUILD;
 		} else if (!strcasecmp(key, "write_mostly")) {
 			if (rs->raid_type->level != 1) {
 				rs->ti->error = "write_mostly option is only valid for RAID1";
@@ -618,7 +618,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "max_write_behind option is only valid for RAID1";
 				return -EINVAL;
 			}
-			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+			rs->ctr_flags |= CTR_FLAG_MAX_WRITE_BEHIND;
 
 			/*
 			 * In device-mapper, we specify things in sectors, but
@@ -631,14 +631,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			}
 			rs->md.bitmap_info.max_write_behind = value;
 		} else if (!strcasecmp(key, "daemon_sleep")) {
-			rs->print_flags |= DMPF_DAEMON_SLEEP;
+			rs->ctr_flags |= CTR_FLAG_DAEMON_SLEEP;
 			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
 				rs->ti->error = "daemon sleep period out of range";
 				return -EINVAL;
 			}
 			rs->md.bitmap_info.daemon_sleep = value;
 		} else if (!strcasecmp(key, "stripe_cache")) {
-			rs->print_flags |= DMPF_STRIPE_CACHE;
+			rs->ctr_flags |= CTR_FLAG_STRIPE_CACHE;
 
 			/*
 			 * In device-mapper, we specify things in sectors, but
@@ -656,21 +656,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				return -EINVAL;
 			}
 		} else if (!strcasecmp(key, "min_recovery_rate")) {
-			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+			rs->ctr_flags |= CTR_FLAG_MIN_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "min_recovery_rate out of range";
 				return -EINVAL;
 			}
 			rs->md.sync_speed_min = (int)value;
 		} else if (!strcasecmp(key, "max_recovery_rate")) {
-			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+			rs->ctr_flags |= CTR_FLAG_MAX_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "max_recovery_rate out of range";
 				return -EINVAL;
 			}
 			rs->md.sync_speed_max = (int)value;
 		} else if (!strcasecmp(key, "region_size")) {
-			rs->print_flags |= DMPF_REGION_SIZE;
+			rs->ctr_flags |= CTR_FLAG_REGION_SIZE;
 			region_size = value;
 		} else if (!strcasecmp(key, "raid10_copies") &&
 			   (rs->raid_type->level == 10)) {
@@ -678,7 +678,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "Bad value for 'raid10_copies'";
 				return -EINVAL;
 			}
-			rs->print_flags |= DMPF_RAID10_COPIES;
+			rs->ctr_flags |= CTR_FLAG_RAID10_COPIES;
 			raid10_copies = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
@@ -720,7 +720,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		rs->md.layout = raid10_format_to_md_layout(raid10_format,
 							   raid10_copies);
 		rs->md.new_layout = rs->md.layout;
-	} else if ((rs->raid_type->level > 1) &&
+	} else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
 		   sector_div(sectors_per_dev,
 			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
 		rs->ti->error = "Target length not divisible by number of data devices";
@@ -947,7 +947,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 		return -EINVAL;
 	}
 
-	if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+	if (!(rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)))
 		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
 
 	/*
@@ -1026,8 +1026,9 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	return 0;
 }
 
-static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 {
+	struct mddev *mddev = &rs->md;
 	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
 
 	/*
@@ -1037,8 +1038,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
 	if (!mddev->events && super_init_validation(mddev, rdev))
 		return -EINVAL;
 
-	mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
-	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+	/* Enable bitmap creation for RAID levels != 0 */
+	mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+	rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+
 	if (!test_bit(FirstUse, &rdev->flags)) {
 		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
 		if (rdev->recovery_offset != MaxSector)
@@ -1073,7 +1076,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	freshest = NULL;
 	rdev_for_each_safe(rdev, tmp, mddev) {
 		/*
-		 * Skipping super_load due to DMPF_SYNC will cause
+		 * Skipping super_load due to CTR_FLAG_SYNC will cause
 		 * the array to undergo initialization again as
 		 * though it were new.  This is the intended effect
 		 * of the "sync" directive.
@@ -1082,7 +1085,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		 * that the "sync" directive is disallowed during the
 		 * reshape.
 		 */
-		if (rs->print_flags & DMPF_SYNC)
+		rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
+
+		if (rs->ctr_flags & CTR_FLAG_SYNC)
 			continue;
 
 		if (!rdev->meta_bdev)
@@ -1140,11 +1145,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	 * validation for the remaining devices.
 	 */
 	ti->error = "Unable to assemble array: Invalid superblocks";
-	if (super_validate(mddev, freshest))
+	if (super_validate(rs, freshest))
 		return -EINVAL;
 
 	rdev_for_each(rdev, mddev)
-		if ((rdev != freshest) && super_validate(mddev, rdev))
+		if ((rdev != freshest) && super_validate(rs, rdev))
 			return -EINVAL;
 
 	return 0;
@@ -1243,7 +1248,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
-	    (num_raid_devs >= INT_MAX)) {
+	    (num_raid_devs > MAX_RAID_DEVICES)) {
 		ti->error = "Cannot understand number of raid devices";
 		return -EINVAL;
 	}
@@ -1282,10 +1287,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 */
 	configure_discard_support(ti, rs);
 
-	mutex_lock(&rs->md.reconfig_mutex);
+	/* Has to be held on running the array */
+	mddev_lock_nointr(&rs->md);
 	ret = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
-	mutex_unlock(&rs->md.reconfig_mutex);
+	mddev_unlock(&rs->md);
 
 	if (ret) {
 		ti->error = "Fail to run raid array";
@@ -1368,34 +1374,40 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_INFO:
 		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
 
-		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
-			sync = rs->md.curr_resync_completed;
-		else
-			sync = rs->md.recovery_cp;
-
-		if (sync >= rs->md.resync_max_sectors) {
-			/*
-			 * Sync complete.
-			 */
+		if (rs->raid_type->level) {
+			if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+				sync = rs->md.curr_resync_completed;
+			else
+				sync = rs->md.recovery_cp;
+
+			if (sync >= rs->md.resync_max_sectors) {
+				/*
+				 * Sync complete.
+				 */
+				array_in_sync = 1;
+				sync = rs->md.resync_max_sectors;
+			} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+				/*
+				 * If "check" or "repair" is occurring, the array has
+				 * undergone and initial sync and the health characters
+				 * undergone an initial sync and the health characters
+				 */
+				array_in_sync = 1;
+			} else {
+				/*
+				 * The array may be doing an initial sync, or it may
+				 * be rebuilding individual components.  If all the
+				 * devices are In_sync, then it is the array that is
+				 * being initialized.
+				 */
+				for (i = 0; i < rs->md.raid_disks; i++)
+					if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+						array_in_sync = 1;
+			}
+		} else {
+			/* RAID0 */
 			array_in_sync = 1;
 			sync = rs->md.resync_max_sectors;
-		} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
-			/*
-			 * If "check" or "repair" is occurring, the array has
-			 * undergone and initial sync and the health characters
-			 * should not be 'a' anymore.
-			 */
-			array_in_sync = 1;
-		} else {
-			/*
-			 * The array may be doing an initial sync, or it may
-			 * be rebuilding individual components.  If all the
-			 * devices are In_sync, then it is the array that is
-			 * being initialized.
-			 */
-			for (i = 0; i < rs->md.raid_disks; i++)
-				if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
-					array_in_sync = 1;
 		}
 
 		/*
@@ -1446,7 +1458,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_TABLE:
 		/* The string you would use to construct this array */
 		for (i = 0; i < rs->md.raid_disks; i++) {
-			if ((rs->print_flags & DMPF_REBUILD) &&
+			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				raid_param_cnt += 2; /* for rebuilds */
@@ -1455,33 +1467,33 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 				raid_param_cnt += 2;
 		}
 
-		raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
-		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+		raid_param_cnt += (hweight32(rs->ctr_flags & ~CTR_FLAG_REBUILD) * 2);
+		if (rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC))
 			raid_param_cnt--;
 
 		DMEMIT("%s %u %u", rs->raid_type->name,
 		       raid_param_cnt, rs->md.chunk_sectors);
 
-		if ((rs->print_flags & DMPF_SYNC) &&
+		if ((rs->ctr_flags & CTR_FLAG_SYNC) &&
 		    (rs->md.recovery_cp == MaxSector))
 			DMEMIT(" sync");
-		if (rs->print_flags & DMPF_NOSYNC)
+		if (rs->ctr_flags & CTR_FLAG_NOSYNC)
 			DMEMIT(" nosync");
 
 		for (i = 0; i < rs->md.raid_disks; i++)
-			if ((rs->print_flags & DMPF_REBUILD) &&
+			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				DMEMIT(" rebuild %u", i);
 
-		if (rs->print_flags & DMPF_DAEMON_SLEEP)
+		if (rs->ctr_flags & CTR_FLAG_DAEMON_SLEEP)
 			DMEMIT(" daemon_sleep %lu",
 			       rs->md.bitmap_info.daemon_sleep);
 
-		if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+		if (rs->ctr_flags & CTR_FLAG_MIN_RECOVERY_RATE)
 			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
 
-		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+		if (rs->ctr_flags & CTR_FLAG_MAX_RECOVERY_RATE)
 			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
 
 		for (i = 0; i < rs->md.raid_disks; i++)
@@ -1489,11 +1501,11 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
 				DMEMIT(" write_mostly %u", i);
 
-		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+		if (rs->ctr_flags & CTR_FLAG_MAX_WRITE_BEHIND)
 			DMEMIT(" max_write_behind %lu",
 			       rs->md.bitmap_info.max_write_behind);
 
-		if (rs->print_flags & DMPF_STRIPE_CACHE) {
+		if (rs->ctr_flags & CTR_FLAG_STRIPE_CACHE) {
 			struct r5conf *conf = rs->md.private;
 
 			/* convert from kiB to sectors */
@@ -1501,15 +1513,15 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			       conf ? conf->max_nr_stripes * 2 : 0);
 		}
 
-		if (rs->print_flags & DMPF_REGION_SIZE)
+		if (rs->ctr_flags & CTR_FLAG_REGION_SIZE)
 			DMEMIT(" region_size %lu",
 			       rs->md.bitmap_info.chunksize >> 9);
 
-		if (rs->print_flags & DMPF_RAID10_COPIES)
+		if (rs->ctr_flags & CTR_FLAG_RAID10_COPIES)
 			DMEMIT(" raid10_copies %u",
 			       raid10_md_layout_to_copies(rs->md.layout));
 
-		if (rs->print_flags & DMPF_RAID10_FORMAT)
+		if (rs->ctr_flags & CTR_FLAG_RAID10_FORMAT)
 			DMEMIT(" raid10_format %s",
 			       raid10_md_layout_to_format(rs->md.layout));
 
@@ -1684,26 +1696,48 @@ static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
-	set_bit(MD_CHANGE_DEVS, &rs->md.flags);
-	if (!rs->bitmap_loaded) {
-		bitmap_load(&rs->md);
-		rs->bitmap_loaded = 1;
-	} else {
-		/*
-		 * A secondary resume while the device is active.
-		 * Take this opportunity to check whether any failed
-		 * devices are reachable again.
-		 */
-		attempt_restore_of_faulty_devices(rs);
+	if (rs->raid_type->level) {
+		set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+
+		if (!rs->bitmap_loaded) {
+			bitmap_load(&rs->md);
+			rs->bitmap_loaded = 1;
+		} else {
+			/*
+			 * A secondary resume while the device is active.
+			 * Take this opportunity to check whether any failed
+			 * devices are reachable again.
+			 */
+			attempt_restore_of_faulty_devices(rs);
+		}
+
+		clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 	}
 
-	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 	mddev_resume(&rs->md);
 }
 
+static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+		      struct bio_vec *biovec, int max_size)
+{
+	struct raid_set *rs = ti->private;
+	struct md_personality *pers = rs->md.pers;
+
+	if (pers && pers->mergeable_bvec)
+		return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
+
+	/*
+	 * In case we can't request the personality because
+	 * the raid set is not running yet
+	 *
+	 * -> return safe minimum
+	 */
+	return rs->md.chunk_sectors;
+}
+
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
@@ -1715,6 +1749,7 @@ static struct target_type raid_target = {
 	.presuspend = raid_presuspend,
 	.postsuspend = raid_postsuspend,
 	.resume = raid_resume,
+	.merge = raid_merge,
 };
 
 static int __init dm_raid_init(void)

+ 58 - 17
drivers/md/dm-raid1.c

@@ -23,8 +23,10 @@
 
 #define MAX_RECOVERY 1	/* Maximum number of regions recovered in parallel. */
 
-#define DM_RAID1_HANDLE_ERRORS 0x01
+#define DM_RAID1_HANDLE_ERRORS	0x01
+#define DM_RAID1_KEEP_LOG	0x02
 #define errors_handled(p)	((p)->features & DM_RAID1_HANDLE_ERRORS)
+#define keep_log(p)		((p)->features & DM_RAID1_KEEP_LOG)
 
 static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
 
@@ -229,7 +231,7 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	if (m != get_default_mirror(ms))
 		goto out;
 
-	if (!ms->in_sync) {
+	if (!ms->in_sync && !keep_log(ms)) {
 		/*
 		 * Better to issue requests to same failing device
 		 * than to risk returning corrupt data.
@@ -370,6 +372,17 @@ static int recover(struct mirror_set *ms, struct dm_region *reg)
 	return r;
 }
 
+static void reset_ms_flags(struct mirror_set *ms)
+{
+	unsigned int m;
+
+	ms->leg_failure = 0;
+	for (m = 0; m < ms->nr_mirrors; m++) {
+		atomic_set(&(ms->mirror[m].error_count), 0);
+		ms->mirror[m].error_type = 0;
+	}
+}
+
 static void do_recovery(struct mirror_set *ms)
 {
 	struct dm_region *reg;
@@ -398,6 +411,7 @@ static void do_recovery(struct mirror_set *ms)
 		/* the sync is complete */
 		dm_table_event(ms->ti->table);
 		ms->in_sync = 1;
+		reset_ms_flags(ms);
 	}
 }
 
@@ -759,7 +773,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		dm_rh_delay(ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		if (unlikely(ms->leg_failure) && errors_handled(ms)) {
+		if (unlikely(ms->leg_failure) && errors_handled(ms) && !keep_log(ms)) {
 			spin_lock_irq(&ms->lock);
 			bio_list_add(&ms->failures, bio);
 			spin_unlock_irq(&ms->lock);
@@ -803,15 +817,21 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 
 		/*
 		 * If all the legs are dead, fail the I/O.
-		 * If we have been told to handle errors, hold the bio
-		 * and wait for userspace to deal with the problem.
+		 * If the log device has failed and keep_log is enabled,
+		 * fail the I/O.
+		 *
+		 * If we have been told to handle errors, and keep_log
+		 * isn't enabled, hold the bio and wait for userspace to
+		 * deal with the problem.
+		 *
 		 * Otherwise pretend that the I/O succeeded. (This would
 		 * be wrong if the failed leg returned after reboot and
 		 * got replicated back to the good legs.)
 		 */
-		if (!get_valid_mirror(ms))
+
+		if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure)))
 			bio_endio(bio, -EIO);
-		else if (errors_handled(ms))
+		else if (errors_handled(ms) && !keep_log(ms))
 			hold_bio(ms, bio);
 		else
 			bio_endio(bio, 0);
@@ -987,6 +1007,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
 	unsigned num_features;
 	struct dm_target *ti = ms->ti;
 	char dummy;
+	int i;
 
 	*args_used = 0;
 
@@ -1007,15 +1028,25 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
 		return -EINVAL;
 	}
 
-	if (!strcmp("handle_errors", argv[0]))
-		ms->features |= DM_RAID1_HANDLE_ERRORS;
-	else {
-		ti->error = "Unrecognised feature requested";
+	for (i = 0; i < num_features; i++) {
+		if (!strcmp("handle_errors", argv[0]))
+			ms->features |= DM_RAID1_HANDLE_ERRORS;
+		else if (!strcmp("keep_log", argv[0]))
+			ms->features |= DM_RAID1_KEEP_LOG;
+		else {
+			ti->error = "Unrecognised feature requested";
+			return -EINVAL;
+		}
+
+		argc--;
+		argv++;
+		(*args_used)++;
+	}
+	if (!errors_handled(ms) && keep_log(ms)) {
+		ti->error = "keep_log feature requires the handle_errors feature";
 		return -EINVAL;
 	}
 
-	(*args_used)++;
-
 	return 0;
 }
 
@@ -1029,7 +1060,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
  * log_type is "core" or "disk"
  * #log_params is between 1 and 3
  *
- * If present, features must be "handle_errors".
+ * If present, supported features are "handle_errors" and "keep_log".
  */
 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
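
For reference, a mirror table line exercising the new feature would end in a feature count of two, roughly: 0 2097152 mirror core 2 16384 nosync 2 /dev/sdb 0 /dev/sdc 0 2 handle_errors keep_log (an illustrative example assembled from the parameter list above, not taken from the patch). parse_features() rejects keep_log unless handle_errors is also present, and mirror_status() now emits the real feature count instead of the hard-coded "1 handle_errors".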
 {
@@ -1363,6 +1394,7 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
 			  unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned int m, sz = 0;
+	int num_feature_args = 0;
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 	char buffer[ms->nr_mirrors + 1];
@@ -1392,8 +1424,17 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" %s %llu", ms->mirror[m].dev->name,
 			       (unsigned long long)ms->mirror[m].offset);
 
-		if (ms->features & DM_RAID1_HANDLE_ERRORS)
-			DMEMIT(" 1 handle_errors");
+		num_feature_args += !!errors_handled(ms);
+		num_feature_args += !!keep_log(ms);
+		if (num_feature_args) {
+			DMEMIT(" %d", num_feature_args);
+			if (errors_handled(ms))
+				DMEMIT(" handle_errors");
+			if (keep_log(ms))
+				DMEMIT(" keep_log");
+		}
+
+		break;
 	}
 }
 
@@ -1413,7 +1454,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
 
 static struct target_type mirror_target = {
 	.name	 = "mirror",
-	.version = {1, 13, 2},
+	.version = {1, 14, 0},
 	.module	 = THIS_MODULE,
 	.ctr	 = mirror_ctr,
 	.dtr	 = mirror_dtr,

+ 279 - 62
drivers/md/dm-stats.c

@@ -29,30 +29,37 @@ struct dm_stat_percpu {
 	unsigned long long io_ticks[2];
 	unsigned long long io_ticks_total;
 	unsigned long long time_in_queue;
+	unsigned long long *histogram;
 };
 
 struct dm_stat_shared {
 	atomic_t in_flight[2];
-	unsigned long stamp;
+	unsigned long long stamp;
 	struct dm_stat_percpu tmp;
 };
 
 struct dm_stat {
 	struct list_head list_entry;
 	int id;
+	unsigned stat_flags;
 	size_t n_entries;
 	sector_t start;
 	sector_t end;
 	sector_t step;
+	unsigned n_histogram_entries;
+	unsigned long long *histogram_boundaries;
 	const char *program_id;
 	const char *aux_data;
 	struct rcu_head rcu_head;
 	size_t shared_alloc_size;
 	size_t percpu_alloc_size;
+	size_t histogram_alloc_size;
 	struct dm_stat_percpu *stat_percpu[NR_CPUS];
 	struct dm_stat_shared stat_shared[0];
 };
 
+#define STAT_PRECISE_TIMESTAMPS		1
+
 struct dm_stats_last_position {
 	sector_t last_sector;
 	unsigned last_rw;
@@ -160,10 +167,7 @@ static void dm_kvfree(void *ptr, size_t alloc_size)
 
 	free_shared_memory(alloc_size);
 
-	if (is_vmalloc_addr(ptr))
-		vfree(ptr);
-	else
-		kfree(ptr);
+	kvfree(ptr);
 }
 
 static void dm_stat_free(struct rcu_head *head)
@@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
 
 	kfree(s->program_id);
 	kfree(s->aux_data);
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
 		dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+	}
+	dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
 	dm_kvfree(s, s->shared_alloc_size);
 }
 
@@ -227,7 +234,10 @@ void dm_stats_cleanup(struct dm_stats *stats)
 }
 
 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
-			   sector_t step, const char *program_id, const char *aux_data,
+			   sector_t step, unsigned stat_flags,
+			   unsigned n_histogram_entries,
+			   unsigned long long *histogram_boundaries,
+			   const char *program_id, const char *aux_data,
 			   void (*suspend_callback)(struct mapped_device *),
 			   void (*resume_callback)(struct mapped_device *),
 			   struct mapped_device *md)
@@ -238,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	size_t ni;
 	size_t shared_alloc_size;
 	size_t percpu_alloc_size;
+	size_t histogram_alloc_size;
 	struct dm_stat_percpu *p;
 	int cpu;
 	int ret_id;
@@ -261,19 +272,34 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
 		return -EOVERFLOW;
 
-	if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+	histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
+	if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
+		return -EOVERFLOW;
+
+	if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
+				 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
 		return -ENOMEM;
 
 	s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
 	if (!s)
 		return -ENOMEM;
 
+	s->stat_flags = stat_flags;
 	s->n_entries = n_entries;
 	s->start = start;
 	s->end = end;
 	s->step = step;
 	s->shared_alloc_size = shared_alloc_size;
 	s->percpu_alloc_size = percpu_alloc_size;
+	s->histogram_alloc_size = histogram_alloc_size;
+
+	s->n_histogram_entries = n_histogram_entries;
+	s->histogram_boundaries = kmemdup(histogram_boundaries,
+					  s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+	if (!s->histogram_boundaries) {
+		r = -ENOMEM;
+		goto out;
+	}
 
 	s->program_id = kstrdup(program_id, GFP_KERNEL);
 	if (!s->program_id) {
@@ -291,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 		atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
 	}
 
+	if (s->n_histogram_entries) {
+		unsigned long long *hi;
+		hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
+		if (!hi) {
+			r = -ENOMEM;
+			goto out;
+		}
+		for (ni = 0; ni < n_entries; ni++) {
+			s->stat_shared[ni].tmp.histogram = hi;
+			hi += s->n_histogram_entries + 1;
+		}
+	}
+
 	for_each_possible_cpu(cpu) {
 		p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
 		if (!p) {
@@ -298,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 			goto out;
 		}
 		s->stat_percpu[cpu] = p;
+		if (s->n_histogram_entries) {
+			unsigned long long *hi;
+			hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
+			if (!hi) {
+				r = -ENOMEM;
+				goto out;
+			}
+			for (ni = 0; ni < n_entries; ni++) {
+				p[ni].histogram = hi;
+				hi += s->n_histogram_entries + 1;
+			}
+		}
 	}
 
 	/*
@@ -375,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
 	 * vfree can't be called from RCU callback
 	 */
 	for_each_possible_cpu(cpu)
-		if (is_vmalloc_addr(s->stat_percpu))
+		if (is_vmalloc_addr(s->stat_percpu) ||
+		    is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
 			goto do_sync_free;
-	if (is_vmalloc_addr(s)) {
+	if (is_vmalloc_addr(s) ||
+	    is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
 do_sync_free:
 		synchronize_rcu_expedited();
 		dm_stat_free(&s->rcu_head);
@@ -417,18 +470,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program,
 	return 1;
 }
 
-static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
+static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
+			  struct dm_stat_percpu *p)
 {
 	/*
 	 * This is racy, but so is part_round_stats_single.
 	 */
-	unsigned long now = jiffies;
-	unsigned in_flight_read;
-	unsigned in_flight_write;
-	unsigned long difference = now - shared->stamp;
+	unsigned long long now, difference;
+	unsigned in_flight_read, in_flight_write;
+
+	if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
+		now = jiffies;
+	else
+		now = ktime_to_ns(ktime_get());
 
+	difference = now - shared->stamp;
 	if (!difference)
 		return;
+
 	in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
 	in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
 	if (in_flight_read)
@@ -443,8 +502,9 @@ static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *
 }
 
 static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
-			      unsigned long bi_rw, sector_t len, bool merged,
-			      bool end, unsigned long duration)
+			      unsigned long bi_rw, sector_t len,
+			      struct dm_stats_aux *stats_aux, bool end,
+			      unsigned long duration_jiffies)
 {
 	unsigned long idx = bi_rw & REQ_WRITE;
 	struct dm_stat_shared *shared = &s->stat_shared[entry];
@@ -474,15 +534,35 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 	p = &s->stat_percpu[smp_processor_id()][entry];
 
 	if (!end) {
-		dm_stat_round(shared, p);
+		dm_stat_round(s, shared, p);
 		atomic_inc(&shared->in_flight[idx]);
 	} else {
-		dm_stat_round(shared, p);
+		unsigned long long duration;
+		dm_stat_round(s, shared, p);
 		atomic_dec(&shared->in_flight[idx]);
 		p->sectors[idx] += len;
 		p->ios[idx] += 1;
-		p->merges[idx] += merged;
-		p->ticks[idx] += duration;
+		p->merges[idx] += stats_aux->merged;
+		if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
+			p->ticks[idx] += duration_jiffies;
+			duration = jiffies_to_msecs(duration_jiffies);
+		} else {
+			p->ticks[idx] += stats_aux->duration_ns;
+			duration = stats_aux->duration_ns;
+		}
+		if (s->n_histogram_entries) {
+			unsigned lo = 0, hi = s->n_histogram_entries + 1;
+			while (lo + 1 < hi) {
+				unsigned mid = (lo + hi) / 2;
+				if (s->histogram_boundaries[mid - 1] > duration) {
+					hi = mid;
+				} else {
+					lo = mid;
+				}
+
+			}
+			p->histogram[lo]++;
+		}
 	}
 
 #if BITS_PER_LONG == 32
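
The bucket selection added above is a binary search that maps a duration onto one of n_histogram_entries + 1 buckets: bucket 0 collects everything below the first boundary and the last bucket everything at or above the final one. The same logic is shown below as a standalone userspace function, convenient for sanity-checking boundary choices; pick_bucket() and the sample boundaries are mine, only the search itself mirrors the kernel code.

#include <stdio.h>

static unsigned pick_bucket(const unsigned long long *boundaries,
			    unsigned n_entries, unsigned long long duration)
{
	unsigned lo = 0, hi = n_entries + 1;

	while (lo + 1 < hi) {
		unsigned mid = (lo + hi) / 2;

		if (boundaries[mid - 1] > duration)
			hi = mid;	/* duration is below this boundary */
		else
			lo = mid;	/* duration is at or above it */
	}
	return lo;
}

int main(void)
{
	/* e.g. "histogram:100,1000,10000" gives three boundaries, four buckets */
	unsigned long long b[] = { 100, 1000, 10000 };

	printf("%u %u %u %u\n",
	       pick_bucket(b, 3, 50),		/* 0 */
	       pick_bucket(b, 3, 100),		/* 1 */
	       pick_bucket(b, 3, 999),		/* 1 */
	       pick_bucket(b, 3, 123456));	/* 3 */
	return 0;
}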
@@ -494,7 +574,7 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 
 static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 			  sector_t bi_sector, sector_t end_sector,
-			  bool end, unsigned long duration,
+			  bool end, unsigned long duration_jiffies,
 			  struct dm_stats_aux *stats_aux)
 {
 	sector_t rel_sector, offset, todo, fragment_len;
@@ -523,7 +603,7 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 		if (fragment_len > s->step - offset)
 			fragment_len = s->step - offset;
 		dm_stat_for_entry(s, entry, bi_rw, fragment_len,
-				  stats_aux->merged, end, duration);
+				  stats_aux, end, duration_jiffies);
 		todo -= fragment_len;
 		entry++;
 		offset = 0;
@@ -532,11 +612,13 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 
 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 			 sector_t bi_sector, unsigned bi_sectors, bool end,
-			 unsigned long duration, struct dm_stats_aux *stats_aux)
+			 unsigned long duration_jiffies,
+			 struct dm_stats_aux *stats_aux)
 {
 	struct dm_stat *s;
 	sector_t end_sector;
 	struct dm_stats_last_position *last;
+	bool got_precise_time;
 
 	if (unlikely(!bi_sectors))
 		return;
@@ -560,8 +642,17 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 
 	rcu_read_lock();
 
-	list_for_each_entry_rcu(s, &stats->list, list_entry)
-		__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
+	got_precise_time = false;
+	list_for_each_entry_rcu(s, &stats->list, list_entry) {
+		if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
+			if (!end)
+				stats_aux->duration_ns = ktime_to_ns(ktime_get());
+			else
+				stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
+			got_precise_time = true;
+		}
+		__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
+	}
 
 	rcu_read_unlock();
 }
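
The precise-timestamp handling above reuses a single per-bio field: on the submission pass (!end) stats_aux->duration_ns holds the start time in nanoseconds, and on the completion pass it is overwritten with the elapsed delta before dm_stat_for_entry() consumes it. The trick in isolation, as a userspace sketch; now_ns(), stats_aux and account() are assumed names, with clock_gettime() standing in for ktime_get().

#include <stdbool.h>
#include <time.h>

static unsigned long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

struct stats_aux {
	unsigned long long duration_ns;
};

static void account(struct stats_aux *aux, bool end, bool want_precise)
{
	if (!want_precise)
		return;

	if (!end)
		aux->duration_ns = now_ns();			/* stash the start */
	else
		aux->duration_ns = now_ns() - aux->duration_ns;	/* now a delta */
}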
@@ -574,10 +665,25 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 
 	local_irq_disable();
 	p = &s->stat_percpu[smp_processor_id()][x];
-	dm_stat_round(shared, p);
+	dm_stat_round(s, shared, p);
 	local_irq_enable();
 
-	memset(&shared->tmp, 0, sizeof(shared->tmp));
+	shared->tmp.sectors[READ] = 0;
+	shared->tmp.sectors[WRITE] = 0;
+	shared->tmp.ios[READ] = 0;
+	shared->tmp.ios[WRITE] = 0;
+	shared->tmp.merges[READ] = 0;
+	shared->tmp.merges[WRITE] = 0;
+	shared->tmp.ticks[READ] = 0;
+	shared->tmp.ticks[WRITE] = 0;
+	shared->tmp.io_ticks[READ] = 0;
+	shared->tmp.io_ticks[WRITE] = 0;
+	shared->tmp.io_ticks_total = 0;
+	shared->tmp.time_in_queue = 0;
+
+	if (s->n_histogram_entries)
+		memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
+
 	for_each_possible_cpu(cpu) {
 		p = &s->stat_percpu[cpu][x];
 		shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
@@ -592,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 		shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
 		shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
 		shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+		if (s->n_histogram_entries) {
+			unsigned i;
+			for (i = 0; i < s->n_histogram_entries + 1; i++)
+				shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+		}
 	}
 }
 
@@ -621,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
 		p->io_ticks_total -= shared->tmp.io_ticks_total;
 		p->time_in_queue -= shared->tmp.time_in_queue;
 		local_irq_enable();
+		if (s->n_histogram_entries) {
+			unsigned i;
+			for (i = 0; i < s->n_histogram_entries + 1; i++) {
+				local_irq_disable();
+				p = &s->stat_percpu[smp_processor_id()][x];
+				p->histogram[i] -= shared->tmp.histogram[i];
+				local_irq_enable();
+			}
+		}
 	}
 }
 
@@ -646,11 +766,15 @@ static int dm_stats_clear(struct dm_stats *stats, int id)
 /*
  * This is like jiffies_to_msec, but works for 64-bit values.
  */
-static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
+static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
 {
-	unsigned long long result = 0;
+	unsigned long long result;
 	unsigned mult;
 
+	if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
+		return j;
+
+	result = 0;
 	if (j)
 		result = jiffies_to_msecs(j & 0x3fffff);
 	if (j >= 1 << 22) {
@@ -706,22 +830,29 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 
 		__dm_stat_init_temporary_percpu_totals(shared, s, x);
 
-		DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+		DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
 		       (unsigned long long)start,
 		       (unsigned long long)step,
 		       shared->tmp.ios[READ],
 		       shared->tmp.merges[READ],
 		       shared->tmp.sectors[READ],
-		       dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
+		       dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
 		       shared->tmp.ios[WRITE],
 		       shared->tmp.merges[WRITE],
 		       shared->tmp.sectors[WRITE],
-		       dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
+		       dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
 		       dm_stat_in_flight(shared),
-		       dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
-		       dm_jiffies_to_msec64(shared->tmp.time_in_queue),
-		       dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
-		       dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
+		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
+		       dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
+		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
+		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
+		if (s->n_histogram_entries) {
+			unsigned i;
+			for (i = 0; i < s->n_histogram_entries + 1; i++) {
+				DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
+			}
+		}
+		DMEMIT("\n");
 
 		if (unlikely(sz + 1 >= maxlen))
 			goto buffer_overflow;
@@ -763,55 +894,134 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
 	return 0;
 }
 
+static int parse_histogram(const char *h, unsigned *n_histogram_entries,
+			   unsigned long long **histogram_boundaries)
+{
+	const char *q;
+	unsigned n;
+	unsigned long long last;
+
+	*n_histogram_entries = 1;
+	for (q = h; *q; q++)
+		if (*q == ',')
+			(*n_histogram_entries)++;
+
+	*histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+	if (!*histogram_boundaries)
+		return -ENOMEM;
+
+	n = 0;
+	last = 0;
+	while (1) {
+		unsigned long long hi;
+		int s;
+		char ch;
+		s = sscanf(h, "%llu%c", &hi, &ch);
+		if (!s || (s == 2 && ch != ','))
+			return -EINVAL;
+		if (hi <= last)
+			return -EINVAL;
+		last = hi;
+		(*histogram_boundaries)[n] = hi;
+		if (s == 1)
+			return 0;
+		h = strchr(h, ',') + 1;
+		n++;
+	}
+}
+
 static int message_stats_create(struct mapped_device *md,
 				unsigned argc, char **argv,
 				char *result, unsigned maxlen)
 {
+	int r;
 	int id;
 	char dummy;
 	unsigned long long start, end, len, step;
 	unsigned divisor;
 	const char *program_id, *aux_data;
+	unsigned stat_flags = 0;
+
+	unsigned n_histogram_entries = 0;
+	unsigned long long *histogram_boundaries = NULL;
+
+	struct dm_arg_set as, as_backup;
+	const char *a;
+	unsigned feature_args;
 
 	/*
 	 * Input format:
-	 *   <range> <step> [<program_id> [<aux_data>]]
+	 *   <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
 	 */
 
-	if (argc < 3 || argc > 5)
-		return -EINVAL;
+	if (argc < 3)
+		goto ret_einval;
 
-	if (!strcmp(argv[1], "-")) {
+	as.argc = argc;
+	as.argv = argv;
+	dm_consume_args(&as, 1);
+
+	a = dm_shift_arg(&as);
+	if (!strcmp(a, "-")) {
 		start = 0;
 		len = dm_get_size(md);
 		if (!len)
 			len = 1;
-	} else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
+	} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
 		   start != (sector_t)start || len != (sector_t)len)
-		return -EINVAL;
+		goto ret_einval;
 
 	end = start + len;
 	if (start >= end)
-		return -EINVAL;
+		goto ret_einval;
 
-	if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+	a = dm_shift_arg(&as);
+	if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
+		if (!divisor)
+			return -EINVAL;
 		step = end - start;
 		if (do_div(step, divisor))
 			step++;
 		if (!step)
 			step = 1;
-	} else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
+	} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
 		   step != (sector_t)step || !step)
-		return -EINVAL;
+		goto ret_einval;
+
+	as_backup = as;
+	a = dm_shift_arg(&as);
+	if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
+		while (feature_args--) {
+			a = dm_shift_arg(&as);
+			if (!a)
+				goto ret_einval;
+			if (!strcasecmp(a, "precise_timestamps"))
+				stat_flags |= STAT_PRECISE_TIMESTAMPS;
+			else if (!strncasecmp(a, "histogram:", 10)) {
+				if (n_histogram_entries)
+					goto ret_einval;
+				if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
+					goto ret;
+			} else
+				goto ret_einval;
+		}
+	} else {
+		as = as_backup;
+	}
 
 	program_id = "-";
 	aux_data = "-";
 
-	if (argc > 3)
-		program_id = argv[3];
+	a = dm_shift_arg(&as);
+	if (a)
+		program_id = a;
 
-	if (argc > 4)
-		aux_data = argv[4];
+	a = dm_shift_arg(&as);
+	if (a)
+		aux_data = a;
+
+	if (as.argc)
+		goto ret_einval;
 
 	/*
 	 * If a buffer overflow happens after we created the region,
@@ -820,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
 	 * leaked).  So we must detect buffer overflow in advance.
 	 */
 	snprintf(result, maxlen, "%d", INT_MAX);
-	if (dm_message_test_buffer_overflow(result, maxlen))
-		return 1;
+	if (dm_message_test_buffer_overflow(result, maxlen)) {
+		r = 1;
+		goto ret;
+	}
 
-	id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
+	id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
+			     n_histogram_entries, histogram_boundaries, program_id, aux_data,
 			     dm_internal_suspend_fast, dm_internal_resume_fast, md);
-	if (id < 0)
-		return id;
+	if (id < 0) {
+		r = id;
+		goto ret;
+	}
 
 	snprintf(result, maxlen, "%d", id);
 
-	return 1;
+	r = 1;
+	goto ret;
+
+ret_einval:
+	r = -EINVAL;
+ret:
+	kfree(histogram_boundaries);
+	return r;
 }
 
 static int message_stats_delete(struct mapped_device *md,
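
With the reworked parsing above, optional features slot in between the step and the program_id, preceded by their count. An illustrative invocation (device name and numbers made up, syntax per the input-format comment above): dmsetup message vg-data 0 "@stats_create - /100 2 precise_timestamps histogram:1000000,10000000 myprog" divides the device into 100 areas, switches the region to nanosecond timestamps, and keeps a three-bucket latency histogram with boundaries at 1 ms and 10 ms; as dm_stat_for_entry() shows, the boundaries are compared against milliseconds by default and against nanoseconds once precise_timestamps is set.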
@@ -933,11 +1155,6 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 {
 	int r;
 
-	if (dm_request_based(md)) {
-		DMWARN("Statistics are only supported for bio-based devices");
-		return -EOPNOTSUPP;
-	}
-
 	/* All messages here must start with '@' */
 	if (!strcasecmp(argv[0], "@stats_create"))
 		r = message_stats_create(md, argc, argv, result, maxlen);

+ 3 - 1
drivers/md/dm-stats.h

@@ -18,6 +18,7 @@ struct dm_stats {
 
 struct dm_stats_aux {
 	bool merged;
+	unsigned long long duration_ns;
 };
 
 void dm_stats_init(struct dm_stats *st);
@@ -30,7 +31,8 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 
 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 			 sector_t bi_sector, unsigned bi_sectors, bool end,
-			 unsigned long duration, struct dm_stats_aux *aux);
+			 unsigned long duration_jiffies,
+			 struct dm_stats_aux *aux);
 
 static inline bool dm_stats_used(struct dm_stats *st)
 {

+ 1 - 3
drivers/md/dm-stripe.c

@@ -451,10 +451,8 @@ int __init dm_stripe_init(void)
 	int r;
 
 	r = dm_register_target(&stripe_target);
-	if (r < 0) {
+	if (r < 0)
 		DMWARN("target registration failed");
-		return r;
-	}
 
 	return r;
 }

+ 2 - 2
drivers/md/dm-table.c

@@ -964,8 +964,8 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 		return -EINVAL;
 	}
 
-	if (!t->mempools)
-		return -ENOMEM;
+	if (IS_ERR(t->mempools))
+		return PTR_ERR(t->mempools);
 
 	return 0;
 }
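
This two-line fix only makes sense because the mempool allocator assigned just above this hunk now reports failure as an ERR_PTR-encoded pointer rather than NULL, so the old NULL test would accept an error pointer and lose the specific errno. The standard idiom in miniature; alloc_pools() and struct pools are made-up stand-ins, not dm_alloc_md_mempools().

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct pools {
	int dummy;
};

static struct pools *alloc_pools(gfp_t gfp)
{
	struct pools *p = kzalloc(sizeof(*p), gfp);

	if (!p)
		return ERR_PTR(-ENOMEM);	/* encode the errno in the pointer */
	return p;
}

static int use_pools(void)
{
	struct pools *p = alloc_pools(GFP_KERNEL);

	if (IS_ERR(p))
		return PTR_ERR(p);		/* propagate the encoded errno */

	/* ... use p ... */
	kfree(p);
	return 0;
}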

+ 116 - 8
drivers/md/dm-thin-metadata.c

@@ -184,7 +184,6 @@ struct dm_pool_metadata {
 	uint64_t trans_id;
 	unsigned long flags;
 	sector_t data_block_size;
-	bool read_only:1;
 
 	/*
 	 * Set if a transaction has to be aborted but the attempt to roll back
@@ -836,7 +835,6 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 	init_rwsem(&pmd->root_lock);
 	pmd->time = 0;
 	INIT_LIST_HEAD(&pmd->thin_devices);
-	pmd->read_only = false;
 	pmd->fail_io = false;
 	pmd->bdev = bdev;
 	pmd->data_block_size = data_block_size;
@@ -880,7 +878,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
 		return -EBUSY;
 	}
 
-	if (!pmd->read_only && !pmd->fail_io) {
+	if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
 		r = __commit_transaction(pmd);
 		if (r < 0)
 			DMWARN("%s: __commit_transaction() failed, error = %d",
@@ -1392,10 +1390,11 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 	dm_block_t keys[2] = { td->id, block };
 	struct dm_btree_info *info;
 
-	if (pmd->fail_io)
-		return -EINVAL;
-
 	down_read(&pmd->root_lock);
+	if (pmd->fail_io) {
+		up_read(&pmd->root_lock);
+		return -EINVAL;
+	}
 
 	if (can_issue_io) {
 		info = &pmd->info;
@@ -1419,6 +1418,63 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 	return r;
 }
 
+/* FIXME: write a more efficient one in btree */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+			      dm_block_t begin, dm_block_t end,
+			      dm_block_t *thin_begin, dm_block_t *thin_end,
+			      dm_block_t *pool_begin, bool *maybe_shared)
+{
+	int r;
+	dm_block_t pool_end;
+	struct dm_thin_lookup_result lookup;
+
+	if (end < begin)
+		return -ENODATA;
+
+	/*
+	 * Find first mapped block.
+	 */
+	while (begin < end) {
+		r = dm_thin_find_block(td, begin, true, &lookup);
+		if (r) {
+			if (r != -ENODATA)
+				return r;
+		} else
+			break;
+
+		begin++;
+	}
+
+	if (begin == end)
+		return -ENODATA;
+
+	*thin_begin = begin;
+	*pool_begin = lookup.block;
+	*maybe_shared = lookup.shared;
+
+	begin++;
+	pool_end = *pool_begin + 1;
+	while (begin != end) {
+		r = dm_thin_find_block(td, begin, true, &lookup);
+		if (r) {
+			if (r == -ENODATA)
+				break;
+			else
+				return r;
+		}
+
+		if ((lookup.block != pool_end) ||
+		    (lookup.shared != *maybe_shared))
+			break;
+
+		pool_end++;
+		begin++;
+	}
+
+	*thin_end = begin;
+	return 0;
+}
+
 static int __insert(struct dm_thin_device *td, dm_block_t block,
 		    dm_block_t data_block)
 {
@@ -1471,6 +1527,47 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
 	return 0;
 }
 
+static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
+{
+	int r;
+	unsigned count;
+	struct dm_pool_metadata *pmd = td->pmd;
+	dm_block_t keys[1] = { td->id };
+	__le64 value;
+	dm_block_t mapping_root;
+
+	/*
+	 * Find the mapping tree
+	 */
+	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
+	if (r)
+		return r;
+
+	/*
+	 * Remove from the mapping tree, taking care to inc the
+	 * ref count so it doesn't get deleted.
+	 */
+	mapping_root = le64_to_cpu(value);
+	dm_tm_inc(pmd->tm, mapping_root);
+	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
+	if (r)
+		return r;
+
+	r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
+	if (r)
+		return r;
+
+	td->mapped_blocks -= count;
+	td->changed = 1;
+
+	/*
+	 * Reinsert the mapping tree.
+	 */
+	value = cpu_to_le64(mapping_root);
+	__dm_bless_for_disk(&value);
+	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
+}
+
 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 {
 	int r = -EINVAL;
@@ -1483,6 +1580,19 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 	return r;
 }
 
+int dm_thin_remove_range(struct dm_thin_device *td,
+			 dm_block_t begin, dm_block_t end)
+{
+	int r = -EINVAL;
+
+	down_write(&td->pmd->root_lock);
+	if (!td->pmd->fail_io)
+		r = __remove_range(td, begin, end);
+	up_write(&td->pmd->root_lock);
+
+	return r;
+}
+
 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
 {
 	int r;
@@ -1739,7 +1849,6 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 {
 	down_write(&pmd->root_lock);
-	pmd->read_only = true;
 	dm_bm_set_read_only(pmd->bm);
 	up_write(&pmd->root_lock);
 }
@@ -1747,7 +1856,6 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
 {
 	down_write(&pmd->root_lock);
-	pmd->read_only = false;
 	dm_bm_set_read_write(pmd->bm);
 	up_write(&pmd->root_lock);
 }

+ 11 - 0
drivers/md/dm-thin-metadata.h

@@ -146,6 +146,15 @@ struct dm_thin_lookup_result {
 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 		       int can_issue_io, struct dm_thin_lookup_result *result);
 
+/*
+ * Retrieve the next run of contiguously mapped blocks.  Useful for working
+ * out where to break up IO.  Returns 0 on success, < 0 on error.
+ */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+			      dm_block_t begin, dm_block_t end,
+			      dm_block_t *thin_begin, dm_block_t *thin_end,
+			      dm_block_t *pool_begin, bool *maybe_shared);
+
 /*
  * Obtain an unused block.
  */
@@ -158,6 +167,8 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
 			 dm_block_t data_block);
 
 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
+int dm_thin_remove_range(struct dm_thin_device *td,
+			 dm_block_t begin, dm_block_t end);
 
 /*
  * Queries.
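
dm_thin_find_mapped_range() reports the next contiguous run of mapped blocks inside [begin, end); a caller advances begin to the returned thin_end and repeats, which is how the range-discard code in dm-thin.c below walks a discard. A simplified userspace model of that iteration over a toy mapping table (the shared flag is omitted, and the toy types and find function are illustrative, not the kernel API):

#include <stdio.h>
#include <errno.h>

#define NBLOCKS 16

/* Toy thin device: mapping[v] < 0 means "unmapped", otherwise the data block. */
static const int mapping[NBLOCKS] = {
	-1, -1, 10, 11, 12, -1, 40, 41, -1, -1, -1, 7, 8, 9, -1, -1
};

/* Same contract as dm_thin_find_mapped_range(): find the first mapped run
 * in [begin, end), return 0 and the run, or -ENODATA if nothing is mapped. */
static int find_mapped_range(unsigned begin, unsigned end,
			     unsigned *thin_begin, unsigned *thin_end,
			     unsigned *data_begin)
{
	while (begin < end && mapping[begin] < 0)
		begin++;
	if (begin == end)
		return -ENODATA;

	*thin_begin = begin;
	*data_begin = (unsigned)mapping[begin];

	/* extend the run while virtual and data blocks stay contiguous */
	while (begin + 1 < end && mapping[begin + 1] == mapping[begin] + 1)
		begin++;

	*thin_end = begin + 1;
	return 0;
}

int main(void)
{
	unsigned begin = 0, end = NBLOCKS;
	unsigned tb, te, db;

	while (!find_mapped_range(begin, end, &tb, &te, &db)) {
		printf("virt [%u,%u) -> data [%u,%u)\n", tb, te, db, db + (te - tb));
		begin = te;	/* continue after this run */
	}
	return 0;
}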

+ 445 - 167
drivers/md/dm-thin.c

@@ -111,22 +111,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 /*
  * Key building.
  */
-static void build_data_key(struct dm_thin_device *td,
-			   dm_block_t b, struct dm_cell_key *key)
+enum lock_space {
+	VIRTUAL,
+	PHYSICAL
+};
+
+static void build_key(struct dm_thin_device *td, enum lock_space ls,
+		      dm_block_t b, dm_block_t e, struct dm_cell_key *key)
 {
-	key->virtual = 0;
+	key->virtual = (ls == VIRTUAL);
 	key->dev = dm_thin_dev_id(td);
 	key->block_begin = b;
-	key->block_end = b + 1ULL;
+	key->block_end = e;
+}
+
+static void build_data_key(struct dm_thin_device *td, dm_block_t b,
+			   struct dm_cell_key *key)
+{
+	build_key(td, PHYSICAL, b, b + 1llu, key);
 }
 
 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 			      struct dm_cell_key *key)
 {
-	key->virtual = 1;
-	key->dev = dm_thin_dev_id(td);
-	key->block_begin = b;
-	key->block_end = b + 1ULL;
+	build_key(td, VIRTUAL, b, b + 1llu, key);
 }
 
 /*----------------------------------------------------------------*/
@@ -312,6 +320,138 @@ struct thin_c {
 
 /*----------------------------------------------------------------*/
 
+/**
+ * __blkdev_issue_discard_async - queue a discard with async completion
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
+ * @parent_bio: parent discard bio that all sub discards get chained to
+ *
+ * Description:
+ *    Asynchronously issue a discard request for the sectors in question.
+ *    NOTE: this variant of blk-core's blkdev_issue_discard() is a stop-gap
+ *    that is being kept local to DM thinp until the block changes to allow
+ *    late bio splitting land upstream.
+ */
+static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
+					sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
+					struct bio *parent_bio)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	int type = REQ_WRITE | REQ_DISCARD;
+	unsigned int max_discard_sectors, granularity;
+	int alignment;
+	struct bio *bio;
+	int ret = 0;
+	struct blk_plug plug;
+
+	if (!q)
+		return -ENXIO;
+
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	/* Zero-sector (unknown) and one-sector granularities are the same.  */
+	granularity = max(q->limits.discard_granularity >> 9, 1U);
+	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+	/*
+	 * Ensure that max_discard_sectors is of the proper
+	 * granularity, so that requests stay aligned after a split.
+	 */
+	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+	max_discard_sectors -= max_discard_sectors % granularity;
+	if (unlikely(!max_discard_sectors)) {
+		/* Avoid infinite loop below. Being cautious never hurts. */
+		return -EOPNOTSUPP;
+	}
+
+	if (flags & BLKDEV_DISCARD_SECURE) {
+		if (!blk_queue_secdiscard(q))
+			return -EOPNOTSUPP;
+		type |= REQ_SECURE;
+	}
+
+	blk_start_plug(&plug);
+	while (nr_sects) {
+		unsigned int req_sects;
+		sector_t end_sect, tmp;
+
+		/*
+		 * Required bio_put occurs in bio_endio thanks to bio_chain below
+		 */
+		bio = bio_alloc(gfp_mask, 1);
+		if (!bio) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+		/*
+		 * If splitting a request, and the next starting sector would be
+		 * misaligned, stop the discard at the previous aligned sector.
+		 */
+		end_sect = sector + req_sects;
+		tmp = end_sect;
+		if (req_sects < nr_sects &&
+		    sector_div(tmp, granularity) != alignment) {
+			end_sect = end_sect - alignment;
+			sector_div(end_sect, granularity);
+			end_sect = end_sect * granularity + alignment;
+			req_sects = end_sect - sector;
+		}
+
+		bio_chain(bio, parent_bio);
+
+		bio->bi_iter.bi_sector = sector;
+		bio->bi_bdev = bdev;
+
+		bio->bi_iter.bi_size = req_sects << 9;
+		nr_sects -= req_sects;
+		sector = end_sect;
+
+		submit_bio(type, bio);
+
+		/*
+		 * We can loop for a long time in here, if someone does
+		 * full device discards (like mkfs). Be nice and allow
+		 * us to schedule out to avoid softlocking if preempt
+		 * is disabled.
+		 */
+		cond_resched();
+	}
+	blk_finish_plug(&plug);
+
+	return ret;
+}
+
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+	return pool->sectors_per_block_shift >= 0;
+}
+
+static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
+{
+	return block_size_is_power_of_two(pool) ?
+		(b << pool->sectors_per_block_shift) :
+		(b * pool->sectors_per_block);
+}
+
+static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e,
+			 struct bio *parent_bio)
+{
+	sector_t s = block_to_sectors(tc->pool, data_b);
+	sector_t len = block_to_sectors(tc->pool, data_e - data_b);
+
+	return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len,
+					    GFP_NOWAIT, 0, parent_bio);
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * wake_worker() is used when new work is queued and when pool_resume is
  * ready to continue deferred IO processing.
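
The splitting loop in __blkdev_issue_discard_async() trims each chunk so that the next chunk starts on a discard_granularity boundary (offset by the device's discard alignment), keeping every sub-discard aligned after a split. A small standalone example of that trimming arithmetic with made-up granularity and alignment values:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Trim req_sects so that (sector + req_sects) % granularity == alignment,
 * the same adjustment the splitting loop above performs.  Only applies when
 * this is not the final chunk. */
static sector_t trim_to_granularity(sector_t sector, sector_t req_sects,
				    sector_t nr_sects, unsigned granularity,
				    unsigned alignment)
{
	sector_t end_sect = sector + req_sects;

	if (req_sects < nr_sects && (end_sect % granularity) != alignment) {
		end_sect = ((end_sect - alignment) / granularity) * granularity
			   + alignment;
		req_sects = end_sect - sector;
	}
	return req_sects;
}

int main(void)
{
	/* 1 MiB granularity (2048 sectors), no alignment offset.  max_chunk is
	 * a multiple of the granularity, as the kernel code guarantees, so the
	 * loop always makes progress. */
	unsigned granularity = 2048, alignment = 0;
	sector_t sector = 1000, nr_sects = 100000, max_chunk = 8192;

	while (nr_sects) {
		sector_t req = nr_sects < max_chunk ? nr_sects : max_chunk;

		req = trim_to_granularity(sector, req, nr_sects, granularity, alignment);
		printf("discard [%llu, %llu)\n", sector, sector + req);
		sector += req;
		nr_sects -= req;
	}
	return 0;
}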
@@ -461,6 +601,7 @@ struct dm_thin_endio_hook {
 	struct dm_deferred_entry *all_io_entry;
 	struct dm_thin_new_mapping *overwrite_mapping;
 	struct rb_node rb_node;
+	struct dm_bio_prison_cell *cell;
 };
 
 static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
@@ -541,11 +682,6 @@ static void error_retry_list(struct pool *pool)
  * target.
  */
 
-static bool block_size_is_power_of_two(struct pool *pool)
-{
-	return pool->sectors_per_block_shift >= 0;
-}
-
 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 {
 	struct pool *pool = tc->pool;
@@ -559,6 +695,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 	return block_nr;
 }
 
+/*
+ * Returns the _complete_ blocks that this bio covers.
+ */
+static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
+				dm_block_t *begin, dm_block_t *end)
+{
+	struct pool *pool = tc->pool;
+	sector_t b = bio->bi_iter.bi_sector;
+	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+
+	b += pool->sectors_per_block - 1ull; /* so we round up */
+
+	if (block_size_is_power_of_two(pool)) {
+		b >>= pool->sectors_per_block_shift;
+		e >>= pool->sectors_per_block_shift;
+	} else {
+		(void) sector_div(b, pool->sectors_per_block);
+		(void) sector_div(e, pool->sectors_per_block);
+	}
+
+	if (e < b)
+		/* Can happen if the bio is within a single block. */
+		e = b;
+
+	*begin = b;
+	*end = e;
+}
+
 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 {
 	struct pool *pool = tc->pool;
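
get_bio_block_range() rounds the bio's start up and its end down so only blocks completely covered by the discard are considered; a bio confined to a single block yields an empty range, as the comment in the hunk notes. A quick standalone check of that rounding with sample numbers (128-sector blocks chosen purely for illustration):

#include <stdio.h>

typedef unsigned long long sector_t;

/* Mirror of get_bio_block_range(): report only the whole blocks covered by
 * sectors [start, start + len) for a pool with sectors_per_block sectors. */
static void bio_block_range(sector_t start, sector_t len,
			    sector_t sectors_per_block,
			    sector_t *begin, sector_t *end)
{
	sector_t b = start + sectors_per_block - 1;	/* round start up */
	sector_t e = start + len;			/* round end down */

	b /= sectors_per_block;
	e /= sectors_per_block;

	if (e < b)		/* bio fits inside one block */
		e = b;

	*begin = b;
	*end = e;
}

int main(void)
{
	sector_t b, e;

	bio_block_range(100, 800, 128, &b, &e);	/* sectors [100,900) */
	printf("[%llu,%llu)\n", b, e);		/* blocks [1,7): 128..896 fully covered */

	bio_block_range(130, 60, 128, &b, &e);	/* entirely inside block 1 */
	printf("[%llu,%llu)\n", b, e);		/* empty range: [2,2) */
	return 0;
}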
@@ -647,7 +811,7 @@ struct dm_thin_new_mapping {
 	struct list_head list;
 
 	bool pass_discard:1;
-	bool definitely_not_shared:1;
+	bool maybe_shared:1;
 
 	/*
 	 * Track quiescing, copying and zeroing preparation actions.  When this
@@ -658,9 +822,9 @@ struct dm_thin_new_mapping {
 
 	int err;
 	struct thin_c *tc;
-	dm_block_t virt_block;
+	dm_block_t virt_begin, virt_end;
 	dm_block_t data_block;
-	struct dm_bio_prison_cell *cell, *cell2;
+	struct dm_bio_prison_cell *cell;
 
 	/*
 	 * If the bio covers the whole area of a block then we can avoid
@@ -705,6 +869,8 @@ static void overwrite_endio(struct bio *bio, int err)
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
 
+	bio->bi_end_io = m->saved_bi_end_io;
+
 	m->err = err;
 	complete_mapping_preparation(m);
 }
@@ -793,9 +959,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-	if (m->bio)
-		m->bio->bi_end_io = m->saved_bi_end_io;
-
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
 	mempool_free(m, m->tc->pool->mapping_pool);
@@ -805,13 +968,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
 	struct pool *pool = tc->pool;
-	struct bio *bio;
+	struct bio *bio = m->bio;
 	int r;
 
-	bio = m->bio;
-	if (bio)
-		bio->bi_end_io = m->saved_bi_end_io;
-
 	if (m->err) {
 		cell_error(pool, m->cell);
 		goto out;
@@ -822,7 +981,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 * Any I/O for this block arriving after this point will get
 	 * remapped to it directly.
 	 */
-	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+	r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
 	if (r) {
 		metadata_operation_failed(pool, "dm_thin_insert_block", r);
 		cell_error(pool, m->cell);
@@ -849,50 +1008,112 @@ out:
 	mempool_free(m, pool->mapping_pool);
 }
 
-static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+/*----------------------------------------------------------------*/
+
+static void free_discard_mapping(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
+	if (m->cell)
+		cell_defer_no_holder(tc, m->cell);
+	mempool_free(m, tc->pool->mapping_pool);
+}
 
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+{
 	bio_io_error(m->bio);
+	free_discard_mapping(m);
+}
+
+static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
+{
+	bio_endio(m->bio, 0);
+	free_discard_mapping(m);
+}
+
+static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
+{
+	int r;
+	struct thin_c *tc = m->tc;
+
+	r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
+	if (r) {
+		metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
+		bio_io_error(m->bio);
+	} else
+		bio_endio(m->bio, 0);
+
 	cell_defer_no_holder(tc, m->cell);
-	cell_defer_no_holder(tc, m->cell2);
 	mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
 {
+	/*
+	 * We've already unmapped this range of blocks, but before we
+	 * passdown we have to check that these blocks are now unused.
+	 */
+	int r;
+	bool used = true;
 	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
+	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
 
-	inc_all_io_entry(tc->pool, m->bio);
-	cell_defer_no_holder(tc, m->cell);
-	cell_defer_no_holder(tc, m->cell2);
+	while (b != end) {
+		/* find start of unmapped run */
+		for (; b < end; b++) {
+			r = dm_pool_block_is_used(pool->pmd, b, &used);
+			if (r)
+				return r;
 
-	if (m->pass_discard)
-		if (m->definitely_not_shared)
-			remap_and_issue(tc, m->bio, m->data_block);
-		else {
-			bool used = false;
-			if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
-				bio_endio(m->bio, 0);
-			else
-				remap_and_issue(tc, m->bio, m->data_block);
+			if (!used)
+				break;
 		}
-	else
-		bio_endio(m->bio, 0);
 
-	mempool_free(m, tc->pool->mapping_pool);
+		if (b == end)
+			break;
+
+		/* find end of run */
+		for (e = b + 1; e != end; e++) {
+			r = dm_pool_block_is_used(pool->pmd, e, &used);
+			if (r)
+				return r;
+
+			if (used)
+				break;
+		}
+
+		r = issue_discard(tc, b, e, m->bio);
+		if (r)
+			return r;
+
+		b = e;
+	}
+
+	return 0;
 }
 
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 {
 	int r;
 	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
 
-	r = dm_thin_remove_block(tc->td, m->virt_block);
+	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
 	if (r)
-		DMERR_LIMIT("dm_thin_remove_block() failed");
+		metadata_operation_failed(pool, "dm_thin_remove_range", r);
+
+	else if (m->maybe_shared)
+		r = passdown_double_checking_shared_status(m);
+	else
+		r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio);
 
-	process_prepared_discard_passdown(m);
+	/*
+	 * Even if r is set, there could be sub discards in flight that we
+	 * need to wait for.
+	 */
+	bio_endio(m->bio, r);
+	cell_defer_no_holder(tc, m->cell);
+	mempool_free(m, pool->mapping_pool);
 }
 
 static void process_prepared(struct pool *pool, struct list_head *head,
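
passdown_double_checking_shared_status() walks the data blocks it has just unmapped and only passes a discard down for runs that are no longer referenced anywhere else, using a two-phase scan: find the start of an unused run, then find its end. A compact userspace model of that scan, where a used[] array stands in for dm_pool_block_is_used():

#include <stdio.h>
#include <stdbool.h>

/* Report each maximal run of unused blocks in [b, end) -- the way the
 * passdown code above decides which sub-ranges are safe to discard. */
static void discard_unused_runs(const bool *used, unsigned b, unsigned end)
{
	while (b != end) {
		unsigned e;

		/* find start of an unused run */
		while (b < end && used[b])
			b++;
		if (b == end)
			break;

		/* find end of the run */
		for (e = b + 1; e != end && !used[e]; e++)
			;

		printf("discard blocks [%u, %u)\n", b, e);
		b = e;
	}
}

int main(void)
{
	/* 1 = still referenced (e.g. shared with a snapshot), 0 = free */
	const bool used[12] = { 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1 };

	discard_unused_runs(used, 0, 12);	/* [1,3) [5,8) [9,10) */
	return 0;
}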
@@ -976,7 +1197,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
 }
 
 static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
-				      dm_block_t data_block,
+				      dm_block_t data_begin,
 				      struct dm_thin_new_mapping *m)
 {
 	struct pool *pool = tc->pool;
@@ -986,7 +1207,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
 	m->bio = bio;
 	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 	inc_all_io_entry(pool, bio);
-	remap_and_issue(tc, bio, data_block);
+	remap_and_issue(tc, bio, data_begin);
 }
 
 /*
@@ -1003,7 +1224,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
 	m->tc = tc;
-	m->virt_block = virt_block;
+	m->virt_begin = virt_block;
+	m->virt_end = virt_block + 1u;
 	m->data_block = data_dest;
 	m->cell = cell;
 
@@ -1082,7 +1304,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 
 	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
 	m->tc = tc;
-	m->virt_block = virt_block;
+	m->virt_begin = virt_block;
+	m->virt_end = virt_block + 1u;
 	m->data_block = data_block;
 	m->cell = cell;
 
@@ -1091,16 +1314,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	 * zeroing pre-existing data, we can issue the bio immediately.
 	 * Otherwise we use kcopyd to zero the data first.
 	 */
-	if (!pool->pf.zero_new_blocks)
+	if (pool->pf.zero_new_blocks) {
+		if (io_overwrites_block(pool, bio))
+			remap_and_issue_overwrite(tc, bio, data_block, m);
+		else
+			ll_zero(tc, m, data_block * pool->sectors_per_block,
+				(data_block + 1) * pool->sectors_per_block);
+	} else
 		process_prepared_mapping(m);
-
-	else if (io_overwrites_block(pool, bio))
-		remap_and_issue_overwrite(tc, bio, data_block, m);
-
-	else
-		ll_zero(tc, m,
-			data_block * pool->sectors_per_block,
-			(data_block + 1) * pool->sectors_per_block);
 }
 
 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -1291,99 +1512,149 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 		retry_on_resume(bio);
 }
 
-static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void process_discard_cell_no_passdown(struct thin_c *tc,
+					     struct dm_bio_prison_cell *virt_cell)
 {
-	int r;
-	struct bio *bio = cell->holder;
 	struct pool *pool = tc->pool;
-	struct dm_bio_prison_cell *cell2;
-	struct dm_cell_key key2;
-	dm_block_t block = get_bio_block(tc, bio);
-	struct dm_thin_lookup_result lookup_result;
-	struct dm_thin_new_mapping *m;
+	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
-	if (tc->requeue_mode) {
-		cell_requeue(pool, cell);
-		return;
-	}
+	/*
+	 * We don't need to lock the data blocks, since there's no
+	 * passdown.  We only lock data blocks for allocation and breaking sharing.
+	 */
+	m->tc = tc;
+	m->virt_begin = virt_cell->key.block_begin;
+	m->virt_end = virt_cell->key.block_end;
+	m->cell = virt_cell;
+	m->bio = virt_cell->holder;
 
-	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
-	switch (r) {
-	case 0:
-		/*
-		 * Check nobody is fiddling with this pool block.  This can
-		 * happen if someone's in the process of breaking sharing
-		 * on this block.
-		 */
-		build_data_key(tc->td, lookup_result.block, &key2);
-		if (bio_detain(tc->pool, &key2, bio, &cell2)) {
-			cell_defer_no_holder(tc, cell);
-			break;
-		}
+	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+		pool->process_prepared_discard(m);
+}
 
-		if (io_overlaps_block(pool, bio)) {
-			/*
-			 * IO may still be going to the destination block.  We must
-			 * quiesce before we can do the removal.
-			 */
-			m = get_next_mapping(pool);
-			m->tc = tc;
-			m->pass_discard = pool->pf.discard_passdown;
-			m->definitely_not_shared = !lookup_result.shared;
-			m->virt_block = block;
-			m->data_block = lookup_result.block;
-			m->cell = cell;
-			m->cell2 = cell2;
-			m->bio = bio;
-
-			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
-				pool->process_prepared_discard(m);
+/*
+ * FIXME: DM local hack to defer parent bios's end_io until we
+ * _know_ all chained sub range discard bios have completed.
+ * Will go away once late bio splitting lands upstream!
+ */
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
 
-		} else {
-			inc_all_io_entry(pool, bio);
-			cell_defer_no_holder(tc, cell);
-			cell_defer_no_holder(tc, cell2);
+static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
+				 struct bio *bio)
+{
+	struct pool *pool = tc->pool;
 
+	int r;
+	bool maybe_shared;
+	struct dm_cell_key data_key;
+	struct dm_bio_prison_cell *data_cell;
+	struct dm_thin_new_mapping *m;
+	dm_block_t virt_begin, virt_end, data_begin;
+
+	while (begin != end) {
+		r = ensure_next_mapping(pool);
+		if (r)
+			/* we did our best */
+			return;
+
+		r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
+					      &data_begin, &maybe_shared);
+		if (r)
 			/*
-			 * The DM core makes sure that the discard doesn't span
-			 * a block boundary.  So we submit the discard of a
-			 * partial block appropriately.
+			 * Silently fail, letting any mappings we've
+			 * created complete.
 			 */
-			if ((!lookup_result.shared) && pool->pf.discard_passdown)
-				remap_and_issue(tc, bio, lookup_result.block);
-			else
-				bio_endio(bio, 0);
+			break;
+
+		build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
+		if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+			/* contention, we'll give up with this range */
+			begin = virt_end;
+			continue;
 		}
-		break;
 
-	case -ENODATA:
 		/*
-		 * It isn't provisioned, just forget it.
+		 * IO may still be going to the destination block.  We must
+		 * quiesce before we can do the removal.
 		 */
-		cell_defer_no_holder(tc, cell);
-		bio_endio(bio, 0);
-		break;
+		m = get_next_mapping(pool);
+		m->tc = tc;
+		m->maybe_shared = maybe_shared;
+		m->virt_begin = virt_begin;
+		m->virt_end = virt_end;
+		m->data_block = data_begin;
+		m->cell = data_cell;
+		m->bio = bio;
 
-	default:
-		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
-			    __func__, r);
-		cell_defer_no_holder(tc, cell);
-		bio_io_error(bio);
-		break;
+		/*
+		 * The parent bio must not complete before sub discard bios are
+		 * chained to it (see __blkdev_issue_discard_async's bio_chain)!
+		 *
+		 * This per-mapping bi_remaining increment is paired with
+		 * the implicit decrement that occurs via bio_endio() in
+		 * process_prepared_discard_{passdown,no_passdown}.
+		 */
+		__bio_inc_remaining(bio);
+		if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+			pool->process_prepared_discard(m);
+
+		begin = virt_end;
 	}
 }
 
+static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
+{
+	struct bio *bio = virt_cell->holder;
+	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+	/*
+	 * The virt_cell will only get freed once the origin bio completes.
+	 * This means it will remain locked while all the individual
+	 * passdown bios are in flight.
+	 */
+	h->cell = virt_cell;
+	break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
+
+	/*
+	 * We complete the bio now, knowing that the bi_remaining field
+	 * will prevent completion until the sub range discards have
+	 * completed.
+	 */
+	bio_endio(bio, 0);
+}
+
 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
 {
-	struct dm_bio_prison_cell *cell;
-	struct dm_cell_key key;
-	dm_block_t block = get_bio_block(tc, bio);
+	dm_block_t begin, end;
+	struct dm_cell_key virt_key;
+	struct dm_bio_prison_cell *virt_cell;
 
-	build_virtual_key(tc->td, block, &key);
-	if (bio_detain(tc->pool, &key, bio, &cell))
+	get_bio_block_range(tc, bio, &begin, &end);
+	if (begin == end) {
+		/*
+		 * The discard covers less than a block.
+		 */
+		bio_endio(bio, 0);
+		return;
+	}
+
+	build_key(tc->td, VIRTUAL, begin, end, &virt_key);
+	if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
+		/*
+		 * Potential starvation issue: We're relying on the
+		 * fs/application being well behaved, and not trying to
+		 * send IO to a region at the same time as discarding it.
+		 * If they do this persistently then it's possible this
+		 * cell will never be granted.
+		 */
 		return;
 
-	process_discard_cell(tc, cell);
+	tc->pool->process_discard_cell(tc, virt_cell);
 }
 
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
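
Per the comments in the hunk above, break_up_discard_bio() bumps the parent bio's __bi_remaining once for every sub-range it hands off, each process_prepared_discard_* drops one of those references through bio_endio(), and the bio_endio() in process_discard_cell_passdown() drops the submitter's own reference; the parent therefore only completes once every sub-discard has. A minimal userspace model of that remaining-count scheme, with plain ints standing in for atomics and bios:

#include <stdio.h>

/* Toy parent "bio": completes when remaining drops to zero. */
struct parent {
	int remaining;
	int completed;
};

static void endio(struct parent *p)
{
	if (--p->remaining == 0) {
		p->completed = 1;
		printf("parent discard completed\n");
	}
}

int main(void)
{
	struct parent p = { .remaining = 1, .completed = 0 };	/* submitter's ref */
	int nr_subranges = 3, i;

	/* break_up_discard_bio(): one extra reference per queued sub-range */
	for (i = 0; i < nr_subranges; i++)
		p.remaining++;

	/* process_discard_cell_passdown(): the submitter drops its reference */
	endio(&p);
	printf("after submitter endio: completed=%d\n", p.completed);	/* still 0 */

	/* each prepared sub-discard finishing drops one more */
	for (i = 0; i < nr_subranges; i++)
		endio(&p);

	return 0;
}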
@@ -2099,6 +2370,24 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
 	       dm_device_name(pool->pool_md), new_mode);
 }
 
+static bool passdown_enabled(struct pool_c *pt)
+{
+	return pt->adjusted_pf.discard_passdown;
+}
+
+static void set_discard_callbacks(struct pool *pool)
+{
+	struct pool_c *pt = pool->ti->private;
+
+	if (passdown_enabled(pt)) {
+		pool->process_discard_cell = process_discard_cell_passdown;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
+	} else {
+		pool->process_discard_cell = process_discard_cell_no_passdown;
+		pool->process_prepared_discard = process_prepared_discard_no_passdown;
+	}
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
 	struct pool_c *pt = pool->ti->private;
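
With set_discard_callbacks(), the choice between the passdown and no-passdown discard paths is made once per pool mode change through function pointers, instead of being decided inside a single handler for every mapping. A tiny sketch of the same dispatch pattern; the struct and handler names here are illustrative only:

#include <stdio.h>
#include <stdbool.h>

struct pool_cfg {
	bool passdown_enabled;
	void (*process_discard)(void);
};

static void discard_passdown(void)    { printf("unmap range, then pass discard to data dev\n"); }
static void discard_no_passdown(void) { printf("unmap range in metadata only\n"); }

/* Analogue of set_discard_callbacks(): pick the handlers once, up front. */
static void set_discard_callbacks(struct pool_cfg *cfg)
{
	cfg->process_discard = cfg->passdown_enabled ?
		discard_passdown : discard_no_passdown;
}

int main(void)
{
	struct pool_cfg cfg = { .passdown_enabled = true };

	set_discard_callbacks(&cfg);
	cfg.process_discard();		/* no per-bio mode checks needed */
	return 0;
}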
@@ -2150,7 +2439,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_cell = process_cell_read_only;
 		pool->process_discard_cell = process_cell_success;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
-		pool->process_prepared_discard = process_prepared_discard_passdown;
+		pool->process_prepared_discard = process_prepared_discard_success;
 
 		error_retry_list(pool);
 		break;
@@ -2169,9 +2458,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_bio = process_bio_read_only;
 		pool->process_discard = process_discard_bio;
 		pool->process_cell = process_cell_read_only;
-		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
-		pool->process_prepared_discard = process_prepared_discard;
+		set_discard_callbacks(pool);
 
 		if (!pool->pf.error_if_no_space && no_space_timeout)
 			queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -2184,9 +2472,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard_bio;
 		pool->process_cell = process_cell;
-		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
-		pool->process_prepared_discard = process_prepared_discard;
+		set_discard_callbacks(pool);
 		break;
 	}
 
@@ -2275,6 +2562,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
 	h->shared_read_entry = NULL;
 	h->all_io_entry = NULL;
 	h->overwrite_mapping = NULL;
+	h->cell = NULL;
 }
 
 /*
@@ -2422,7 +2710,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
 	struct pool *pool = pt->pool;
 	struct block_device *data_bdev = pt->data_dev->bdev;
 	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
-	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
 	const char *reason = NULL;
 	char buf[BDEVNAME_SIZE];
 
@@ -2435,12 +2722,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
 	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
 		reason = "max discard sectors smaller than a block";
 
-	else if (data_limits->discard_granularity > block_size)
-		reason = "discard granularity larger than a block";
-
-	else if (!is_factor(block_size, data_limits->discard_granularity))
-		reason = "discard granularity not a factor of block size";
-
 	if (reason) {
 		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
 		pt->adjusted_pf.discard_passdown = false;
@@ -3375,7 +3656,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (get_pool_mode(pool) >= PM_READ_ONLY) {
 		DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
 		      dm_device_name(pool->pool_md));
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 
 	if (!strcasecmp(argv[0], "create_thin"))
@@ -3573,24 +3854,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
-static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
-{
-	struct pool *pool = pt->pool;
-	struct queue_limits *data_limits;
-
-	limits->max_discard_sectors = pool->sectors_per_block;
-
-	/*
-	 * discard_granularity is just a hint, and not enforced.
-	 */
-	if (pt->adjusted_pf.discard_passdown) {
-		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
-		limits->discard_granularity = max(data_limits->discard_granularity,
-						  pool->sectors_per_block << SECTOR_SHIFT);
-	} else
-		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-}
-
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct pool_c *pt = ti->private;
@@ -3645,14 +3908,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 	disable_passdown_if_not_supported(pt);
 
-	set_discard_limits(pt, limits);
+	/*
+	 * The pool uses the same discard limits as the underlying data
+	 * device.  DM core has already set this up.
+	 */
 }
 
 static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 14, 0},
+	.version = {1, 15, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -3811,8 +4077,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
-		/* Discard bios must be split on a block boundary */
-		ti->split_discard_bios = true;
+		ti->split_discard_bios = false;
 	}
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3899,6 +4164,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 		}
 	}
 
+	if (h->cell)
+		cell_defer_no_holder(h->tc, h->cell);
+
 	return 0;
 }
 
@@ -4026,9 +4294,18 @@ static int thin_iterate_devices(struct dm_target *ti,
 	return 0;
 }
 
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct thin_c *tc = ti->private;
+	struct pool *pool = tc->pool;
+
+	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+}
+
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 14, 0},
+	.version = {1, 15, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
@@ -4040,6 +4317,7 @@ static struct target_type thin_target = {
 	.status = thin_status,
 	.merge = thin_merge,
 	.iterate_devices = thin_iterate_devices,
+	.io_hints = thin_io_hints,
 };
 
 /*----------------------------------------------------------------*/

+ 112 - 78
drivers/md/dm.c

@@ -86,6 +86,9 @@ struct dm_rq_target_io {
 	struct kthread_work work;
 	int error;
 	union map_info info;
+	struct dm_stats_aux stats_aux;
+	unsigned long duration_jiffies;
+	unsigned n_sectors;
 };
 
 /*
@@ -995,6 +998,17 @@ static struct dm_rq_target_io *tio_from_request(struct request *rq)
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
 }
 
+static void rq_end_stats(struct mapped_device *md, struct request *orig)
+{
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		tio->duration_jiffies = jiffies - tio->duration_jiffies;
+		dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+				    tio->n_sectors, true, tio->duration_jiffies,
+				    &tio->stats_aux);
+	}
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -1078,6 +1092,7 @@ static void dm_end_request(struct request *clone, int error)
 	}
 
 	free_rq_clone(clone);
+	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
 		blk_end_request_all(rq, error);
 	else
@@ -1113,13 +1128,14 @@ static void old_requeue_request(struct request *rq)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
-						 struct request *rq)
+static void dm_requeue_original_request(struct mapped_device *md,
+					struct request *rq)
 {
 	int rw = rq_data_dir(rq);
 
 	dm_unprep_request(rq);
 
+	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
 		old_requeue_request(rq);
 	else {
@@ -1130,13 +1146,6 @@ static void dm_requeue_unmapped_original_request(struct mapped_device *md,
 	rq_completed(md, rw, false);
 }
 
-static void dm_requeue_unmapped_request(struct request *clone)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-
-	dm_requeue_unmapped_original_request(tio->md, tio->orig);
-}
-
 static void old_stop_queue(struct request_queue *q)
 {
 	unsigned long flags;
@@ -1200,7 +1209,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
 		return;
 	else if (r == DM_ENDIO_REQUEUE)
 		/* The target wants to requeue the I/O */
-		dm_requeue_unmapped_request(clone);
+		dm_requeue_original_request(tio->md, tio->orig);
 	else {
 		DMWARN("unimplemented target endio return value: %d", r);
 		BUG();
@@ -1218,6 +1227,7 @@ static void dm_softirq_done(struct request *rq)
 	int rw;
 
 	if (!clone) {
+		rq_end_stats(tio->md, rq);
 		rw = rq_data_dir(rq);
 		if (!rq->q->mq_ops) {
 			blk_end_request_all(rq, tio->error);
@@ -1910,7 +1920,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		break;
 	case DM_MAPIO_REQUEUE:
 		/* The target wants to requeue the I/O */
-		dm_requeue_unmapped_request(clone);
+		dm_requeue_original_request(md, tio->orig);
 		break;
 	default:
 		if (r > 0) {
@@ -1933,7 +1943,7 @@ static void map_tio_request(struct kthread_work *work)
 	struct mapped_device *md = tio->md;
 
 	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-		dm_requeue_unmapped_original_request(md, rq);
+		dm_requeue_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
@@ -1950,6 +1960,14 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 		md->last_rq_start_time = ktime_get();
 	}
 
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		tio->duration_jiffies = jiffies;
+		tio->n_sectors = blk_rq_sectors(orig);
+		dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+				    tio->n_sectors, false, 0, &tio->stats_aux);
+	}
+
 	/*
 	 * Hold the md reference here for the in-flight I/O.
 	 * We can't rely on the reference count by device opener,
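
Note how the same field carries both ends of the measurement: dm_start_request() stores the start time in tio->duration_jiffies, and rq_end_stats() overwrites it with the elapsed time before passing it to dm_stats_account_io(). A toy version of that start-then-elapsed reuse, with time(NULL) standing in for jiffies:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

struct toy_tio {
	unsigned long duration;	/* start time, later replaced by elapsed time */
	unsigned n_sectors;
};

static void start_request(struct toy_tio *tio, unsigned sectors)
{
	tio->duration = (unsigned long)time(NULL);	/* "jiffies" at start */
	tio->n_sectors = sectors;
}

static void end_request(struct toy_tio *tio)
{
	/* overwrite the start stamp with the elapsed duration */
	tio->duration = (unsigned long)time(NULL) - tio->duration;
	printf("accounted %u sectors, duration=%lus\n",
	       tio->n_sectors, tio->duration);
}

int main(void)
{
	struct toy_tio tio;

	start_request(&tio, 8);
	sleep(1);		/* request "in flight" */
	end_request(&tio);
	return 0;
}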
@@ -2173,6 +2191,40 @@ static void dm_init_old_md_queue(struct mapped_device *md)
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 }
 
+static void cleanup_mapped_device(struct mapped_device *md)
+{
+	cleanup_srcu_struct(&md->io_barrier);
+
+	if (md->wq)
+		destroy_workqueue(md->wq);
+	if (md->kworker_task)
+		kthread_stop(md->kworker_task);
+	if (md->io_pool)
+		mempool_destroy(md->io_pool);
+	if (md->rq_pool)
+		mempool_destroy(md->rq_pool);
+	if (md->bs)
+		bioset_free(md->bs);
+
+	if (md->disk) {
+		spin_lock(&_minor_lock);
+		md->disk->private_data = NULL;
+		spin_unlock(&_minor_lock);
+		if (blk_get_integrity(md->disk))
+			blk_integrity_unregister(md->disk);
+		del_gendisk(md->disk);
+		put_disk(md->disk);
+	}
+
+	if (md->queue)
+		blk_cleanup_queue(md->queue);
+
+	if (md->bdev) {
+		bdput(md->bdev);
+		md->bdev = NULL;
+	}
+}
+
 /*
  * Allocate and initialise a blank device with a given minor.
  */
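
cleanup_mapped_device() checks each member before tearing it down, so alloc_dev() can jump to a single bad: label no matter how far initialisation got, and free_dev() below reuses the same routine. A small sketch of that idempotent-cleanup pattern; the resources and their names are illustrative:

#include <stdio.h>
#include <stdlib.h>

struct dev {
	void *queue;
	void *disk;
	FILE *log;
};

/* Safe to call with any subset of members initialised -- the shape of
 * cleanup_mapped_device() above. */
static void cleanup_dev(struct dev *d)
{
	if (d->log)
		fclose(d->log);
	if (d->disk)
		free(d->disk);
	if (d->queue)
		free(d->queue);
}

static struct dev *alloc_dev(void)
{
	struct dev *d = calloc(1, sizeof(*d));

	if (!d)
		return NULL;

	d->queue = malloc(64);
	if (!d->queue)
		goto bad;

	d->disk = malloc(64);
	if (!d->disk)
		goto bad;

	d->log = fopen("/dev/null", "w");
	if (!d->log)
		goto bad;

	return d;
bad:
	cleanup_dev(d);		/* one label handles every partial state */
	free(d);
	return NULL;
}

int main(void)
{
	struct dev *d = alloc_dev();

	if (d) {
		cleanup_dev(d);
		free(d);
	}
	return 0;
}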
@@ -2218,13 +2270,13 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!md->queue)
-		goto bad_queue;
+		goto bad;
 
 	dm_init_md_queue(md);
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
-		goto bad_disk;
+		goto bad;
 
 	atomic_set(&md->pending[0], 0);
 	atomic_set(&md->pending[1], 0);
@@ -2245,11 +2297,11 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
 	if (!md->wq)
-		goto bad_thread;
+		goto bad;
 
 	md->bdev = bdget_disk(md->disk, 0);
 	if (!md->bdev)
-		goto bad_bdev;
+		goto bad;
 
 	bio_init(&md->flush_bio);
 	md->flush_bio.bi_bdev = md->bdev;
@@ -2266,15 +2318,8 @@ static struct mapped_device *alloc_dev(int minor)
 
 	return md;
 
-bad_bdev:
-	destroy_workqueue(md->wq);
-bad_thread:
-	del_gendisk(md->disk);
-	put_disk(md->disk);
-bad_disk:
-	blk_cleanup_queue(md->queue);
-bad_queue:
-	cleanup_srcu_struct(&md->io_barrier);
+bad:
+	cleanup_mapped_device(md);
 bad_io_barrier:
 	free_minor(minor);
 bad_minor:
@@ -2291,71 +2336,65 @@ static void free_dev(struct mapped_device *md)
 	int minor = MINOR(disk_devt(md->disk));
 
 	unlock_fs(md);
-	destroy_workqueue(md->wq);
 
-	if (md->kworker_task)
-		kthread_stop(md->kworker_task);
-	if (md->io_pool)
-		mempool_destroy(md->io_pool);
-	if (md->rq_pool)
-		mempool_destroy(md->rq_pool);
-	if (md->bs)
-		bioset_free(md->bs);
+	cleanup_mapped_device(md);
+	if (md->use_blk_mq)
+		blk_mq_free_tag_set(&md->tag_set);
 
-	cleanup_srcu_struct(&md->io_barrier);
 	free_table_devices(&md->table_devices);
 	dm_stats_cleanup(&md->stats);
-
-	spin_lock(&_minor_lock);
-	md->disk->private_data = NULL;
-	spin_unlock(&_minor_lock);
-	if (blk_get_integrity(md->disk))
-		blk_integrity_unregister(md->disk);
-	del_gendisk(md->disk);
-	put_disk(md->disk);
-	blk_cleanup_queue(md->queue);
-	if (md->use_blk_mq)
-		blk_mq_free_tag_set(&md->tag_set);
-	bdput(md->bdev);
 	free_minor(minor);
 
 	module_put(THIS_MODULE);
 	kfree(md);
 }
 
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+	if (type == DM_TYPE_BIO_BASED)
+		return type;
+
+	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
 	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-	if (md->bs) {
-		/* The md already has necessary mempools. */
-		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+	switch (filter_md_type(dm_table_get_type(t), md)) {
+	case DM_TYPE_BIO_BASED:
+		if (md->bs && md->io_pool) {
 			/*
+			 * This bio-based md already has necessary mempools.
 			 * Reload bioset because front_pad may have changed
 			 * because a different table was loaded.
 			 */
 			bioset_free(md->bs);
 			md->bs = p->bs;
 			p->bs = NULL;
+			goto out;
 		}
-		/*
-		 * There's no need to reload with request-based dm
-		 * because the size of front_pad doesn't change.
-		 * Note for future: If you are to reload bioset,
-		 * prep-ed requests in the queue may refer
-		 * to bio from the old bioset, so you must walk
-		 * through the queue to unprep.
-		 */
-		goto out;
+		break;
+	case DM_TYPE_REQUEST_BASED:
+		if (md->rq_pool && md->io_pool)
+			/*
+			 * This request-based md already has necessary mempools.
+			 */
+			goto out;
+		break;
+	case DM_TYPE_MQ_REQUEST_BASED:
+		BUG_ON(p); /* No mempools needed */
+		return;
 	}
 
+	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
 	p->rq_pool = NULL;
 	md->bs = p->bs;
 	p->bs = NULL;
-
 out:
 	/* mempool bind completed, no longer need any mempools in the table */
 	dm_table_free_md_mempools(t);
@@ -2675,6 +2714,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 		/* Direct call is fine since .queue_rq allows allocations */
 		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
 			/* Undo dm_start_request() before requeuing */
+			rq_end_stats(md, rq);
 			rq_completed(md, rq_data_dir(rq), false);
 			return BLK_MQ_RQ_QUEUE_BUSY;
 		}
@@ -2734,14 +2774,6 @@ out_tag_set:
 	return err;
 }
 
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
-	if (type == DM_TYPE_BIO_BASED)
-		return type;
-
-	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
 /*
  * Setup the DM device's queue based on md's type
  */
@@ -3463,7 +3495,7 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 
 	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
 		offsetof(struct dm_target_io, clone);
@@ -3482,24 +3514,26 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 	return pools;
 out:
 	dm_free_md_mempools(pools);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
 					    unsigned type)
 {
-	unsigned int pool_size = dm_get_reserved_rq_based_ios();
+	unsigned int pool_size;
 	struct dm_md_mempools *pools;
 
+	if (filter_md_type(type, md) == DM_TYPE_MQ_REQUEST_BASED)
+		return NULL; /* No mempools needed */
+
+	pool_size = dm_get_reserved_rq_based_ios();
 	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
-	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-	}
+	pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+	if (!pools->rq_pool)
+		goto out;
 
 	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
 	if (!pools->io_pool)
@@ -3508,7 +3542,7 @@ struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
 	return pools;
 out:
 	dm_free_md_mempools(pools);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 void dm_free_md_mempools(struct dm_md_mempools *pools)

+ 6 - 0
drivers/md/persistent-data/dm-block-manager.c

@@ -609,6 +609,12 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
 	dm_bufio_prefetch(bm->bufio, b, 1);
 }
 
+bool dm_bm_is_read_only(struct dm_block_manager *bm)
+{
+	return bm->read_only;
+}
+EXPORT_SYMBOL_GPL(dm_bm_is_read_only);
+
 void dm_bm_set_read_only(struct dm_block_manager *bm)
 {
 	bm->read_only = true;

+ 1 - 0
drivers/md/persistent-data/dm-block-manager.h

@@ -123,6 +123,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
  * Additionally you should not use dm_bm_unlock_move, however no error will
  * be returned if you do.
  */
+bool dm_bm_is_read_only(struct dm_block_manager *bm);
 void dm_bm_set_read_only(struct dm_block_manager *bm);
 void dm_bm_set_read_write(struct dm_block_manager *bm);
 

+ 127 - 0
drivers/md/persistent-data/dm-btree-remove.c

@@ -590,3 +590,130 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 	return r;
 }
 EXPORT_SYMBOL_GPL(dm_btree_remove);
+
+/*----------------------------------------------------------------*/
+
+static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
+			  struct dm_btree_value_type *vt, dm_block_t root,
+			  uint64_t key, int *index)
+{
+	int i = *index, r;
+	struct btree_node *n;
+
+	for (;;) {
+		r = shadow_step(s, root, vt);
+		if (r < 0)
+			break;
+
+		/*
+		 * We have to patch up the parent node, ugly, but I don't
+		 * see a way to do this automatically as part of the spine
+		 * op.
+		 */
+		if (shadow_has_parent(s)) {
+			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
+			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
+			       &location, sizeof(__le64));
+		}
+
+		n = dm_block_data(shadow_current(s));
+
+		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+			*index = lower_bound(n, key);
+			return 0;
+		}
+
+		r = rebalance_children(s, info, vt, key);
+		if (r)
+			break;
+
+		n = dm_block_data(shadow_current(s));
+		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+			*index = lower_bound(n, key);
+			return 0;
+		}
+
+		i = lower_bound(n, key);
+
+		/*
+		 * We know the key is present, or else
+		 * rebalance_children would have returned
+		 * -ENODATA
+		 */
+		root = value64(n, i);
+	}
+
+	return r;
+}
+
+static int remove_one(struct dm_btree_info *info, dm_block_t root,
+		      uint64_t *keys, uint64_t end_key,
+		      dm_block_t *new_root, unsigned *nr_removed)
+{
+	unsigned level, last_level = info->levels - 1;
+	int index = 0, r = 0;
+	struct shadow_spine spine;
+	struct btree_node *n;
+	uint64_t k;
+
+	init_shadow_spine(&spine, info);
+	for (level = 0; level < last_level; level++) {
+		r = remove_raw(&spine, info, &le64_type,
+			       root, keys[level], (unsigned *) &index);
+		if (r < 0)
+			goto out;
+
+		n = dm_block_data(shadow_current(&spine));
+		root = value64(n, index);
+	}
+
+	r = remove_nearest(&spine, info, &info->value_type,
+			   root, keys[last_level], &index);
+	if (r < 0)
+		goto out;
+
+	n = dm_block_data(shadow_current(&spine));
+
+	if (index < 0)
+		index = 0;
+
+	if (index >= le32_to_cpu(n->header.nr_entries)) {
+		r = -ENODATA;
+		goto out;
+	}
+
+	k = le64_to_cpu(n->keys[index]);
+	if (k >= keys[last_level] && k < end_key) {
+		if (info->value_type.dec)
+			info->value_type.dec(info->value_type.context,
+					     value_ptr(n, index));
+
+		delete_at(n, index);
+
+	} else
+		r = -ENODATA;
+
+out:
+	*new_root = shadow_root(&spine);
+	exit_shadow_spine(&spine);
+
+	return r;
+}
+
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+			   uint64_t *first_key, uint64_t end_key,
+			   dm_block_t *new_root, unsigned *nr_removed)
+{
+	int r;
+
+	*nr_removed = 0;
+	do {
+		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
+		if (!r)
+			(*nr_removed)++;
+	} while (!r);
+
+	*new_root = root;
+	return r == -ENODATA ? 0 : r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
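
dm_btree_remove_leaves() is a loop around remove_one(): each pass deletes the lowest remaining key in [first_key, end_key), the loop stops when remove_one() reports -ENODATA, and that final -ENODATA is translated to success. A userspace model of that repeat-until-ENODATA shape; a sorted array stands in for the btree leaves (the real code re-shadows the spine on every pass):

#include <stdio.h>
#include <errno.h>

static unsigned long long keys[] = { 3, 5, 9, 12, 17, 21 };
static unsigned nr_keys = 6;

/* Remove the lowest key in [first, end), or return -ENODATA if none is left. */
static int remove_one(unsigned long long first, unsigned long long end)
{
	unsigned i;

	for (i = 0; i < nr_keys; i++) {
		if (keys[i] >= first && keys[i] < end) {
			/* delete_at(): shift the remaining entries down */
			for (; i + 1 < nr_keys; i++)
				keys[i] = keys[i + 1];
			nr_keys--;
			return 0;
		}
	}
	return -ENODATA;
}

/* Same contract as dm_btree_remove_leaves(): remove everything in the
 * half-open range and report how many entries went away. */
static int remove_range(unsigned long long first, unsigned long long end,
			unsigned *nr_removed)
{
	int r;

	*nr_removed = 0;
	do {
		r = remove_one(first, end);
		if (!r)
			(*nr_removed)++;
	} while (!r);

	return r == -ENODATA ? 0 : r;
}

int main(void)
{
	unsigned removed;

	remove_range(5, 18, &removed);	/* removes 5, 9, 12, 17 */
	printf("removed %u, %u keys left\n", removed, nr_keys);
	return 0;
}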

+ 9 - 0
drivers/md/persistent-data/dm-btree.h

@@ -134,6 +134,15 @@ int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 		    uint64_t *keys, dm_block_t *new_root);
 
+/*
+ * Removes values between 'keys' and keys2, where keys2 is keys with the
+ * final key replaced with 'end_key'.  'end_key' is the one-past-the-end
+ * value.  'keys' may be altered.
+ */
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+			   uint64_t *keys, uint64_t end_key,
+			   dm_block_t *new_root, unsigned *nr_removed);
+
 /*
  * Returns < 0 on failure.  Otherwise the number of key entries that have
  * been filled out.  Remember trees can have zero entries, and as such have

+ 35 - 15
drivers/md/persistent-data/dm-space-map-metadata.c

@@ -204,6 +204,27 @@ static void in(struct sm_metadata *smm)
 	smm->recursion_count++;
 }
 
+static int apply_bops(struct sm_metadata *smm)
+{
+	int r = 0;
+
+	while (!brb_empty(&smm->uncommitted)) {
+		struct block_op bop;
+
+		r = brb_pop(&smm->uncommitted, &bop);
+		if (r) {
+			DMERR("bug in bop ring buffer");
+			break;
+		}
+
+		r = commit_bop(smm, &bop);
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
 static int out(struct sm_metadata *smm)
 {
 	int r = 0;
@@ -216,21 +237,8 @@ static int out(struct sm_metadata *smm)
 		return -ENOMEM;
 	}
 
-	if (smm->recursion_count == 1) {
-		while (!brb_empty(&smm->uncommitted)) {
-			struct block_op bop;
-
-			r = brb_pop(&smm->uncommitted, &bop);
-			if (r) {
-				DMERR("bug in bop ring buffer");
-				break;
-			}
-
-			r = commit_bop(smm, &bop);
-			if (r)
-				break;
-		}
-	}
+	if (smm->recursion_count == 1)
+		apply_bops(smm);
 
 	smm->recursion_count--;
 
@@ -704,6 +712,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
 		}
 		old_len = smm->begin;
 
+		r = apply_bops(smm);
+		if (r) {
+			DMERR("%s: apply_bops failed", __func__);
+			goto out;
+		}
+
 		r = sm_ll_commit(&smm->ll);
 		if (r)
 			goto out;
@@ -773,6 +787,12 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	if (r)
 		return r;
 
+	r = apply_bops(smm);
+	if (r) {
+		DMERR("%s: apply_bops failed", __func__);
+		return r;
+	}
+
 	return sm_metadata_commit(sm);
 }
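
apply_bops() factors the "drain every uncommitted block op" loop out of out(), so the same drain can also run before the commits in sm_metadata_extend() and dm_sm_metadata_create(); ops left queued across those commits are what the hunks above are flushing. A userspace sketch of draining a pending-op queue before committing, with the queue and the commit as simplified stand-ins:

#include <stdio.h>

enum op_type { OP_INC, OP_DEC };

struct block_op {
	enum op_type type;
	unsigned long block;
};

static struct block_op pending[8];
static unsigned nr_pending;

static int commit_bop(const struct block_op *op)
{
	printf("%s block %lu\n", op->type == OP_INC ? "inc" : "dec", op->block);
	return 0;
}

/* Analogue of apply_bops(): flush every queued op, stopping on error. */
static int apply_bops(void)
{
	int r = 0;
	unsigned i;

	for (i = 0; i < nr_pending; i++) {
		r = commit_bop(&pending[i]);
		if (r)
			break;
	}
	nr_pending = 0;
	return r;
}

static int commit(void)
{
	/* Committing with ops still queued would lose their ref-count changes;
	 * draining first mirrors the calls added before the commits above. */
	int r = apply_bops();

	if (!r)
		printf("commit\n");
	return r;
}

int main(void)
{
	pending[nr_pending++] = (struct block_op){ OP_INC, 42 };
	pending[nr_pending++] = (struct block_op){ OP_DEC, 7 };
	return commit();
}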