@@ -21,6 +21,9 @@
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/elevator.h> /* for rq_end_sector() */
+#include <linux/blk-mq.h>
 
 #include <trace/events/block.h>
 
@@ -216,8 +219,29 @@ struct mapped_device {
 
 	struct kthread_worker kworker;
 	struct task_struct *kworker_task;
+
+	/* for request-based merge heuristic in dm_request_fn() */
+	unsigned seq_rq_merge_deadline_usecs;
+	int last_rq_rw;
+	sector_t last_rq_pos;
+	ktime_t last_rq_start_time;
+
+	/* for blk-mq request-based DM support */
+	struct blk_mq_tag_set tag_set;
+	bool use_blk_mq;
 };
 
+#ifdef CONFIG_DM_MQ_DEFAULT
+static bool use_blk_mq = true;
+#else
+static bool use_blk_mq = false;
+#endif
+
+bool dm_use_blk_mq(struct mapped_device *md)
+{
+	return md->use_blk_mq;
+}
+
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -250,35 +274,35 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
  */
 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 
-static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
+static unsigned __dm_get_module_param(unsigned *module_param,
				      unsigned def, unsigned max)
 {
-	unsigned ios = ACCESS_ONCE(*reserved_ios);
-	unsigned modified_ios = 0;
+	unsigned param = ACCESS_ONCE(*module_param);
+	unsigned modified_param = 0;
 
-	if (!ios)
-		modified_ios = def;
-	else if (ios > max)
-		modified_ios = max;
+	if (!param)
+		modified_param = def;
+	else if (param > max)
+		modified_param = max;
 
-	if (modified_ios) {
-		(void)cmpxchg(reserved_ios, ios, modified_ios);
-		ios = modified_ios;
+	if (modified_param) {
+		(void)cmpxchg(module_param, param, modified_param);
+		param = modified_param;
 	}
 
-	return ios;
+	return param;
 }
 
 unsigned dm_get_reserved_bio_based_ios(void)
 {
-	return __dm_get_reserved_ios(&reserved_bio_based_ios,
+	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 
 unsigned dm_get_reserved_rq_based_ios(void)
 {
-	return __dm_get_reserved_ios(&reserved_rq_based_ios,
+	return __dm_get_module_param(&reserved_rq_based_ios,
				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
@@ -1017,6 +1041,11 @@ static void end_clone_bio(struct bio *clone, int error)
 	blk_update_request(tio->orig, 0, nr_bytes);
 }
 
+static struct dm_rq_target_io *tio_from_request(struct request *rq)
+{
+	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -1024,10 +1053,13 @@ static void end_clone_bio(struct bio *clone, int error)
  */
 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 {
+	int nr_requests_pending;
+
 	atomic_dec(&md->pending[rw]);
 
 	/* nudge anyone waiting on suspend queue */
-	if (!md_in_flight(md))
+	nr_requests_pending = md_in_flight(md);
+	if (!nr_requests_pending)
 		wake_up(&md->wait);
 
 	/*
@@ -1036,8 +1068,13 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
	 * back into ->request_fn() could deadlock attempting to grab the
	 * queue lock again.
	 */
-	if (run_queue)
-		blk_run_queue_async(md->queue);
+	if (run_queue) {
+		if (md->queue->mq_ops)
+			blk_mq_run_hw_queues(md->queue, true);
+		else if (!nr_requests_pending ||
+			 (nr_requests_pending >= md->queue->nr_congestion_on))
+			blk_run_queue_async(md->queue);
+	}
 
	/*
	 * dm_put() must be at the end of this function. See the comment above
@@ -1048,13 +1085,18 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 static void free_rq_clone(struct request *clone)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
 
 	blk_rq_unprep_clone(clone);
-	if (clone->q && clone->q->mq_ops)
+
+	if (clone->q->mq_ops)
 		tio->ti->type->release_clone_rq(clone);
-	else
-		free_clone_request(tio->md, clone);
-	free_rq_tio(tio);
+	else if (!md->queue->mq_ops)
+		/* request_fn queue stacked on request_fn queue(s) */
+		free_clone_request(md, clone);
+
+	if (!md->queue->mq_ops)
+		free_rq_tio(tio);
 }
 
 /*
@@ -1083,17 +1125,22 @@ static void dm_end_request(struct request *clone, int error)
 	}
 
 	free_rq_clone(clone);
-	blk_end_request_all(rq, error);
+	if (!rq->q->mq_ops)
+		blk_end_request_all(rq, error);
+	else
+		blk_mq_end_request(rq, error);
 	rq_completed(md, rw, true);
 }
 
 static void dm_unprep_request(struct request *rq)
 {
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
 	struct request *clone = tio->clone;
 
-	rq->special = NULL;
-	rq->cmd_flags &= ~REQ_DONTPREP;
+	if (!rq->q->mq_ops) {
+		rq->special = NULL;
+		rq->cmd_flags &= ~REQ_DONTPREP;
+	}
 
 	if (clone)
 		free_rq_clone(clone);
@@ -1102,18 +1149,29 @@ static void dm_unprep_request(struct request *rq)
 /*
  * Requeue the original request of a clone.
  */
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
-						 struct request *rq)
+static void old_requeue_request(struct request *rq)
 {
-	int rw = rq_data_dir(rq);
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
-	dm_unprep_request(rq);
-
 	spin_lock_irqsave(q->queue_lock, flags);
 	blk_requeue_request(q, rq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+						 struct request *rq)
+{
+	int rw = rq_data_dir(rq);
+
+	dm_unprep_request(rq);
+
+	if (!rq->q->mq_ops)
+		old_requeue_request(rq);
+	else {
+		blk_mq_requeue_request(rq);
+		blk_mq_kick_requeue_list(rq->q);
+	}
 
 	rq_completed(md, rw, false);
 }
@@ -1125,35 +1183,44 @@ static void dm_requeue_unmapped_request(struct request *clone)
 	dm_requeue_unmapped_original_request(tio->md, tio->orig);
 }
 
-static void __stop_queue(struct request_queue *q)
-{
-	blk_stop_queue(q);
-}
-
-static void stop_queue(struct request_queue *q)
+static void old_stop_queue(struct request_queue *q)
 {
 	unsigned long flags;
 
+	if (blk_queue_stopped(q))
+		return;
+
 	spin_lock_irqsave(q->queue_lock, flags);
-	__stop_queue(q);
+	blk_stop_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void __start_queue(struct request_queue *q)
+static void stop_queue(struct request_queue *q)
 {
-	if (blk_queue_stopped(q))
-		blk_start_queue(q);
+	if (!q->mq_ops)
+		old_stop_queue(q);
+	else
+		blk_mq_stop_hw_queues(q);
 }
 
-static void start_queue(struct request_queue *q)
+static void old_start_queue(struct request_queue *q)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	__start_queue(q);
+	if (blk_queue_stopped(q))
+		blk_start_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void start_queue(struct request_queue *q)
+{
+	if (!q->mq_ops)
+		old_start_queue(q);
+	else
+		blk_mq_start_stopped_hw_queues(q, true);
+}
+
 static void dm_done(struct request *clone, int error, bool mapped)
 {
 	int r = error;
@@ -1192,13 +1259,20 @@ static void dm_done(struct request *clone, int error, bool mapped)
 static void dm_softirq_done(struct request *rq)
 {
 	bool mapped = true;
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
 	struct request *clone = tio->clone;
+	int rw;
 
 	if (!clone) {
-		blk_end_request_all(rq, tio->error);
-		rq_completed(tio->md, rq_data_dir(rq), false);
-		free_rq_tio(tio);
+		rw = rq_data_dir(rq);
+		if (!rq->q->mq_ops) {
+			blk_end_request_all(rq, tio->error);
+			rq_completed(tio->md, rw, false);
+			free_rq_tio(tio);
+		} else {
+			blk_mq_end_request(rq, tio->error);
+			rq_completed(tio->md, rw, false);
+		}
 		return;
 	}
 
@@ -1214,7 +1288,7 @@ static void dm_softirq_done(struct request *rq)
  */
 static void dm_complete_request(struct request *rq, int error)
 {
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
 
 	tio->error = error;
 	blk_complete_request(rq);
@@ -1233,7 +1307,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
 }
 
 /*
- * Called with the clone's queue lock held
+ * Called with the clone's queue lock held (for non-blk-mq)
  */
 static void end_clone_request(struct request *clone, int error)
 {
@@ -1693,7 +1767,7 @@ out:
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
  */
-static void _dm_request(struct request_queue *q, struct bio *bio)
+static void dm_make_request(struct request_queue *q, struct bio *bio)
 {
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
@@ -1725,16 +1799,6 @@ int dm_request_based(struct mapped_device *md)
 	return blk_queue_stackable(md->queue);
 }
 
-static void dm_request(struct request_queue *q, struct bio *bio)
-{
-	struct mapped_device *md = q->queuedata;
-
-	if (dm_request_based(md))
-		blk_queue_bio(q, bio);
-	else
-		_dm_request(q, bio);
-}
-
 static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 {
 	int r;
@@ -1787,15 +1851,25 @@ static int setup_clone(struct request *clone, struct request *rq,
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
				struct dm_rq_target_io *tio, gfp_t gfp_mask)
 {
-	struct request *clone = alloc_clone_request(md, gfp_mask);
+	/*
+	 * Do not allocate a clone if tio->clone was already set
+	 * (see: dm_mq_queue_rq).
+	 */
+	bool alloc_clone = !tio->clone;
+	struct request *clone;
 
-	if (!clone)
-		return NULL;
+	if (alloc_clone) {
+		clone = alloc_clone_request(md, gfp_mask);
+		if (!clone)
+			return NULL;
+	} else
+		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
 	if (setup_clone(clone, rq, tio, gfp_mask)) {
 		/* -ENOMEM */
-		free_clone_request(md, clone);
+		if (alloc_clone)
+			free_clone_request(md, clone);
 		return NULL;
 	}
 
@@ -1804,6 +1878,19 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 
 static void map_tio_request(struct kthread_work *work);
 
+static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
+		     struct mapped_device *md)
+{
+	tio->md = md;
+	tio->ti = NULL;
+	tio->clone = NULL;
+	tio->orig = rq;
+	tio->error = 0;
+	memset(&tio->info, 0, sizeof(tio->info));
+	if (md->kworker_task)
+		init_kthread_work(&tio->work, map_tio_request);
+}
+
 static struct dm_rq_target_io *prep_tio(struct request *rq,
					struct mapped_device *md, gfp_t gfp_mask)
 {
@@ -1815,13 +1902,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
 	if (!tio)
 		return NULL;
 
-	tio->md = md;
-	tio->ti = NULL;
-	tio->clone = NULL;
-	tio->orig = rq;
-	tio->error = 0;
-	memset(&tio->info, 0, sizeof(tio->info));
-	init_kthread_work(&tio->work, map_tio_request);
+	init_tio(tio, rq, md);
 
 	table = dm_get_live_table(md, &srcu_idx);
 	if (!dm_table_mq_request_based(table)) {
@@ -1865,11 +1946,11 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
  * DM_MAPIO_REQUEUE : the original request needs to be requeued
  * < 0              : the request was completed due to failure
  */
-static int map_request(struct dm_target *ti, struct request *rq,
+static int map_request(struct dm_rq_target_io *tio, struct request *rq,
		       struct mapped_device *md)
 {
 	int r;
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_target *ti = tio->ti;
 	struct request *clone = NULL;
 
 	if (tio->clone) {
@@ -1884,7 +1965,7 @@ static int map_request(struct dm_target *ti, struct request *rq,
 	}
 	if (IS_ERR(clone))
 		return DM_MAPIO_REQUEUE;
-	if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+	if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
 		/* -ENOMEM */
 		ti->type->release_clone_rq(clone);
 		return DM_MAPIO_REQUEUE;
@@ -1925,15 +2006,24 @@ static void map_tio_request(struct kthread_work *work)
 	struct request *rq = tio->orig;
 	struct mapped_device *md = tio->md;
 
-	if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
 		dm_requeue_unmapped_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
 {
-	blk_start_request(orig);
+	if (!orig->q->mq_ops)
+		blk_start_request(orig);
+	else
+		blk_mq_start_request(orig);
 	atomic_inc(&md->pending[rq_data_dir(orig)]);
 
+	if (md->seq_rq_merge_deadline_usecs) {
+		md->last_rq_pos = rq_end_sector(orig);
+		md->last_rq_rw = rq_data_dir(orig);
+		md->last_rq_start_time = ktime_get();
+	}
+
	/*
	 * Hold the md reference here for the in-flight I/O.
	 * We can't rely on the reference count by device opener,
@@ -1944,6 +2034,45 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 	dm_get(md);
 }
 
+#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+	return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count)
+{
+	unsigned deadline;
+
+	if (!dm_request_based(md) || md->use_blk_mq)
+		return count;
+
+	if (kstrtouint(buf, 10, &deadline))
+		return -EINVAL;
+
+	if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
+		deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
+
+	md->seq_rq_merge_deadline_usecs = deadline;
+
+	return count;
+}
+
+static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
+{
+	ktime_t kt_deadline;
+
+	if (!md->seq_rq_merge_deadline_usecs)
+		return false;
+
+	kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
+	kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
+
+	return !ktime_after(ktime_get(), kt_deadline);
+}
+
 /*
  * q->request_fn for request-based dm.
  * Called with the queue lock held.
@@ -1967,7 +2096,7 @@ static void dm_request_fn(struct request_queue *q)
	while (!blk_queue_stopped(q)) {
		rq = blk_peek_request(q);
		if (!rq)
-			goto delay_and_out;
+			goto out;
 
		/* always use block 0 to find the target for flushes for now */
		pos = 0;
@@ -1986,12 +2115,17 @@ static void dm_request_fn(struct request_queue *q)
			continue;
		}
 
+		if (dm_request_peeked_before_merge_deadline(md) &&
+		    md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+		    md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
+			goto delay_and_out;
+
		if (ti->type->busy && ti->type->busy(ti))
			goto delay_and_out;
 
		dm_start_request(md, rq);
 
-		tio = rq->special;
+		tio = tio_from_request(rq);
		/* Establish tio->ti before queuing work (map_tio_request) */
		tio->ti = ti;
		queue_kthread_work(&md->kworker, &tio->work);
@@ -2001,33 +2135,11 @@ static void dm_request_fn(struct request_queue *q)
 	goto out;
 
 delay_and_out:
-	blk_delay_queue(q, HZ / 10);
+	blk_delay_queue(q, HZ / 100);
 out:
 	dm_put_live_table(md, srcu_idx);
 }
 
-int dm_underlying_device_busy(struct request_queue *q)
-{
-	return blk_lld_busy(q);
-}
-EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
-
-static int dm_lld_busy(struct request_queue *q)
-{
-	int r;
-	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table_fast(md);
-
-	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
-		r = 1;
-	else
-		r = dm_table_any_busy_target(map);
-
-	dm_put_live_table_fast(md);
-
-	return r;
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r = bdi_bits;
@@ -2110,7 +2222,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 {
	/*
	 * Request-based dm devices cannot be stacked on top of bio-based dm
-	 * devices. The type of this dm device has not been decided yet.
+	 * devices. The type of this dm device may not have been decided yet.
	 * The type is decided at the first table loading time.
	 * To prevent problematic device stacking, clear the queue flag
	 * for request stacking support until then.
@@ -2118,13 +2230,21 @@ static void dm_init_md_queue(struct mapped_device *md)
	 * This queue is new, so no concurrency on the queue_flags.
	 */
	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+}
+
+static void dm_init_old_md_queue(struct mapped_device *md)
+{
+	md->use_blk_mq = false;
+	dm_init_md_queue(md);
 
+	/*
+	 * Initialize aspects of queue that aren't relevant for blk-mq
+	 */
	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
-	blk_queue_make_request(md->queue, dm_request);
+
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
-	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 }
 
 /*
@@ -2156,6 +2276,7 @@ static struct mapped_device *alloc_dev(int minor)
 	if (r < 0)
 		goto bad_io_barrier;
 
+	md->use_blk_mq = use_blk_mq;
 	md->type = DM_TYPE_NONE;
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
@@ -2267,6 +2388,8 @@ static void free_dev(struct mapped_device *md)
 	del_gendisk(md->disk);
 	put_disk(md->disk);
 	blk_cleanup_queue(md->queue);
+	if (md->use_blk_mq)
+		blk_mq_free_tag_set(&md->tag_set);
 	bdput(md->bdev);
 	free_minor(minor);
 
@@ -2278,7 +2401,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
 	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-	if (md->io_pool && md->bs) {
+	if (md->bs) {
 		/* The md already has necessary mempools. */
 		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
 			/*
@@ -2310,7 +2433,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 	p->bs = NULL;
 
 out:
-	/* mempool bind completed, now no need any mempools in the table */
+	/* mempool bind completed, no longer need any mempools in the table */
 	dm_table_free_md_mempools(t);
 }
 
@@ -2357,7 +2480,7 @@ int dm_queue_merge_is_compulsory(struct request_queue *q)
 	if (!q->merge_bvec_fn)
 		return 0;
 
-	if (q->make_request_fn == dm_request) {
+	if (q->make_request_fn == dm_make_request) {
 		dev_md = q->queuedata;
 		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
 			return 0;
@@ -2426,7 +2549,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may be run just after the setting.
	 */
-	if (dm_table_request_based(t) && !blk_queue_stopped(q))
+	if (dm_table_request_based(t))
 		stop_queue(q);
 
 	__bind_mempools(md, t);
@@ -2508,14 +2631,6 @@ unsigned dm_get_md_type(struct mapped_device *md)
 	return md->type;
 }
 
-static bool dm_md_type_request_based(struct mapped_device *md)
-{
-	unsigned table_type = dm_get_md_type(md);
-
-	return (table_type == DM_TYPE_REQUEST_BASED ||
-		table_type == DM_TYPE_MQ_REQUEST_BASED);
-}
-
 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 {
 	return md->immutable_target_type;
@@ -2532,6 +2647,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
+static void init_rq_based_worker_thread(struct mapped_device *md)
+{
+	/* Initialize the request-based DM worker thread */
+	init_kthread_worker(&md->kworker);
+	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+				       "kdmwork-%s", dm_device_name(md));
+}
+
 /*
  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
  */
@@ -2540,27 +2663,160 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	struct request_queue *q = NULL;
 
 	if (md->queue->elevator)
-		return 1;
+		return 0;
 
 	/* Fully initialize the queue */
 	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
 	if (!q)
-		return 0;
+		return -EINVAL;
+
+	/* disable dm_request_fn's merge heuristic by default */
+	md->seq_rq_merge_deadline_usecs = 0;
 
 	md->queue = q;
-	dm_init_md_queue(md);
+	dm_init_old_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
-	blk_queue_lld_busy(md->queue, dm_lld_busy);
 
-	/* Also initialize the request-based DM worker thread */
-	init_kthread_worker(&md->kworker);
-	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
-				       "kdmwork-%s", dm_device_name(md));
+	init_rq_based_worker_thread(md);
 
 	elv_register_queue(md->queue);
 
-	return 1;
+	return 0;
+}
+
+static int dm_mq_init_request(void *data, struct request *rq,
+			      unsigned int hctx_idx, unsigned int request_idx,
+			      unsigned int numa_node)
+{
+	struct mapped_device *md = data;
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+	/*
+	 * Must initialize md member of tio, otherwise it won't
+	 * be available in dm_mq_queue_rq.
+	 */
+	tio->md = md;
+
+	return 0;
+}
+
+static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+			  const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+	struct mapped_device *md = tio->md;
+	int srcu_idx;
+	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+	struct dm_target *ti;
+	sector_t pos;
+
+	/* always use block 0 to find the target for flushes for now */
+	pos = 0;
+	if (!(rq->cmd_flags & REQ_FLUSH))
+		pos = blk_rq_pos(rq);
+
+	ti = dm_table_find_target(map, pos);
+	if (!dm_target_is_valid(ti)) {
+		dm_put_live_table(md, srcu_idx);
+		DMERR_LIMIT("request attempted access beyond the end of device");
+		/*
+		 * Must perform setup, that rq_completed() requires,
+		 * before returning BLK_MQ_RQ_QUEUE_ERROR
+		 */
+		dm_start_request(md, rq);
+		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
+	dm_put_live_table(md, srcu_idx);
+
+	if (ti->type->busy && ti->type->busy(ti))
+		return BLK_MQ_RQ_QUEUE_BUSY;
+
+	dm_start_request(md, rq);
+
+	/* Init tio using md established in .init_request */
+	init_tio(tio, rq, md);
+
+	/*
+	 * Establish tio->ti before queuing work (map_tio_request)
+	 * or making direct call to map_request().
+	 */
+	tio->ti = ti;
+
+	/* Clone the request if underlying devices aren't blk-mq */
+	if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
+		/* clone request is allocated at the end of the pdu */
+		tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
+		if (!clone_rq(rq, md, tio, GFP_ATOMIC))
+			return BLK_MQ_RQ_QUEUE_BUSY;
+		queue_kthread_work(&md->kworker, &tio->work);
+	} else {
+		/* Direct call is fine since .queue_rq allows allocations */
+		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
+			dm_requeue_unmapped_original_request(md, rq);
+	}
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static struct blk_mq_ops dm_mq_ops = {
+	.queue_rq = dm_mq_queue_rq,
+	.map_queue = blk_mq_map_queue,
+	.complete = dm_softirq_done,
+	.init_request = dm_mq_init_request,
+};
+
+static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
+{
+	unsigned md_type = dm_get_md_type(md);
+	struct request_queue *q;
+	int err;
+
+	memset(&md->tag_set, 0, sizeof(md->tag_set));
+	md->tag_set.ops = &dm_mq_ops;
+	md->tag_set.queue_depth = BLKDEV_MAX_RQ;
+	md->tag_set.numa_node = NUMA_NO_NODE;
+	md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	md->tag_set.nr_hw_queues = 1;
+	if (md_type == DM_TYPE_REQUEST_BASED) {
+		/* make the memory for non-blk-mq clone part of the pdu */
+		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
+	} else
+		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
+	md->tag_set.driver_data = md;
+
+	err = blk_mq_alloc_tag_set(&md->tag_set);
+	if (err)
+		return err;
+
+	q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+	if (IS_ERR(q)) {
+		err = PTR_ERR(q);
+		goto out_tag_set;
+	}
+	md->queue = q;
+	dm_init_md_queue(md);
+
+	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
+	blk_mq_register_disk(md->disk);
+
+	if (md_type == DM_TYPE_REQUEST_BASED)
+		init_rq_based_worker_thread(md);
+
+	return 0;
+
+out_tag_set:
+	blk_mq_free_tag_set(&md->tag_set);
+	return err;
+}
+
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+	if (type == DM_TYPE_BIO_BASED)
+		return type;
+
+	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
 }
 
 /*
@@ -2568,9 +2824,29 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-	if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
-		DMWARN("Cannot initialize queue for request-based mapped device");
-		return -EINVAL;
+	int r;
+	unsigned md_type = filter_md_type(dm_get_md_type(md), md);
+
+	switch (md_type) {
+	case DM_TYPE_REQUEST_BASED:
+		r = dm_init_request_based_queue(md);
+		if (r) {
+			DMWARN("Cannot initialize queue for request-based mapped device");
+			return r;
+		}
+		break;
+	case DM_TYPE_MQ_REQUEST_BASED:
+		r = dm_init_request_based_blk_mq_queue(md);
+		if (r) {
+			DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+			return r;
+		}
+		break;
+	case DM_TYPE_BIO_BASED:
+		dm_init_old_md_queue(md);
+		blk_queue_make_request(md->queue, dm_make_request);
+		blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+		break;
 	}
 
 	return 0;
@@ -2654,7 +2930,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 	set_bit(DMF_FREEING, &md->flags);
 	spin_unlock(&_minor_lock);
 
-	if (dm_request_based(md))
+	if (dm_request_based(md) && md->kworker_task)
 		flush_kthread_worker(&md->kworker);
 
 	/*
@@ -2908,7 +3184,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
	 */
 	if (dm_request_based(md)) {
 		stop_queue(md->queue);
-		flush_kthread_worker(&md->kworker);
+		if (md->kworker_task)
+			flush_kthread_worker(&md->kworker);
 	}
 
 	flush_workqueue(md->wq);
@@ -3206,6 +3483,7 @@ struct gendisk *dm_disk(struct mapped_device *md)
 {
 	return md->disk;
 }
+EXPORT_SYMBOL_GPL(dm_disk);
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {
@@ -3253,16 +3531,19 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size)
 {
 	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	struct kmem_cache *cachep;
+	struct kmem_cache *cachep = NULL;
 	unsigned int pool_size = 0;
 	unsigned int front_pad;
 
 	if (!pools)
 		return NULL;
 
+	type = filter_md_type(type, md);
+
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
 		cachep = _io_cache;
@@ -3270,13 +3551,13 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 		break;
 	case DM_TYPE_REQUEST_BASED:
+		cachep = _rq_tio_cache;
 		pool_size = dm_get_reserved_rq_based_ios();
 		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
 		if (!pools->rq_pool)
 			goto out;
 		/* fall through to setup remaining rq-based pools */
 	case DM_TYPE_MQ_REQUEST_BASED:
-		cachep = _rq_tio_cache;
 		if (!pool_size)
 			pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
@@ -3284,12 +3565,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		WARN_ON(per_bio_data_size != 0);
 		break;
 	default:
-		goto out;
+		BUG();
 	}
 
-	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-	if (!pools->io_pool)
-		goto out;
+	if (cachep) {
+		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+		if (!pools->io_pool)
+			goto out;
+	}
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3346,6 +3629,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");