@@ -5,7 +5,7 @@
*/
#include "dm.h"
-#include "dm-bio-prison.h"
+#include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
+#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
/*----------------------------------------------------------------*/
-#define IOT_RESOLUTION 4
+/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ * either direction
+ */
+
+/*----------------------------------------------------------------*/
struct io_tracker {
|
|
|
spinlock_t lock;
|
|
@@ -99,18 +111,177 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
|
|
/*
|
|
|
- * Glossary:
|
|
|
- *
|
|
|
- * oblock: index of an origin block
|
|
|
- * cblock: index of a cache block
|
|
|
- * promotion: movement of a block from origin to cache
|
|
|
- * demotion: movement of a block from cache to origin
|
|
|
- * migration: movement of a block between the origin and cache device,
|
|
|
- * either direction
|
|
|
+ * Represents a chunk of future work. 'input' allows continuations to pass
|
|
|
+ * values between themselves, typically error values.
|
|
|
*/
|
|
|
+struct continuation {
|
|
|
+ struct work_struct ws;
|
|
|
+ int input;
|
|
|
+};
|
|
|
+
|
|
|
+static inline void init_continuation(struct continuation *k,
|
|
|
+ void (*fn)(struct work_struct *))
|
|
|
+{
|
|
|
+ INIT_WORK(&k->ws, fn);
|
|
|
+ k->input = 0;
|
|
|
+}
|
|
|
+
|
|
|
+static inline void queue_continuation(struct workqueue_struct *wq,
|
|
|
+ struct continuation *k)
|
|
|
+{
|
|
|
+ queue_work(wq, &k->ws);
|
|
|
+}
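
As a reading aid only (an editorial sketch, not part of this patch), the snippet below shows how a continuation is typically chained: the work function recovers its enclosing object with container_of() and checks k->input for an error handed on by the previous step. The example_ctx and example_step* names are hypothetical.

/* Editorial sketch only -- not part of this patch. */
struct example_ctx {
	struct continuation k;
	int value;		/* hypothetical payload carried across steps */
};

static void example_step2(struct work_struct *ws)
{
	struct continuation *k = container_of(ws, struct continuation, ws);
	struct example_ctx *ctx = container_of(k, struct example_ctx, k);

	if (k->input) {
		/* the previous step reported an error (typically -EIO) */
		return;
	}

	pr_debug("continuation value %d\n", ctx->value);
}

static void example_step1(struct example_ctx *ctx, struct workqueue_struct *wq)
{
	/* arrange for example_step2 to run next; it sees any error via k.input */
	init_continuation(&ctx->k, example_step2);
	queue_continuation(wq, &ctx->k);
}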
|
|
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
|
|
+/*
|
|
|
+ * The batcher collects together pieces of work that need a particular
|
|
|
+ * operation to occur before they can proceed (typically a commit).
|
|
|
+ */
|
|
|
+struct batcher {
|
|
|
+ /*
|
|
|
+ * The operation that everyone is waiting for.
|
|
|
+ */
|
|
|
+ int (*commit_op)(void *context);
|
|
|
+ void *commit_context;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * This is how bios should be issued once the commit op is complete
|
|
|
+ * (accounted_request).
|
|
|
+ */
|
|
|
+ void (*issue_op)(struct bio *bio, void *context);
|
|
|
+ void *issue_context;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Queued work gets put on here after commit.
|
|
|
+ */
|
|
|
+ struct workqueue_struct *wq;
|
|
|
+
|
|
|
+ spinlock_t lock;
|
|
|
+ struct list_head work_items;
|
|
|
+ struct bio_list bios;
|
|
|
+ struct work_struct commit_work;
|
|
|
+
|
|
|
+ bool commit_scheduled;
|
|
|
+};
|
|
|
+
|
|
|
+static void __commit(struct work_struct *_ws)
|
|
|
+{
|
|
|
+ struct batcher *b = container_of(_ws, struct batcher, commit_work);
|
|
|
+
|
|
|
+ int r;
|
|
|
+ unsigned long flags;
|
|
|
+ struct list_head work_items;
|
|
|
+ struct work_struct *ws, *tmp;
|
|
|
+ struct continuation *k;
|
|
|
+ struct bio *bio;
|
|
|
+ struct bio_list bios;
|
|
|
+
|
|
|
+ INIT_LIST_HEAD(&work_items);
|
|
|
+ bio_list_init(&bios);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We have to grab these before the commit_op to avoid a race
|
|
|
+ * condition.
|
|
|
+ */
|
|
|
+ spin_lock_irqsave(&b->lock, flags);
|
|
|
+ list_splice_init(&b->work_items, &work_items);
|
|
|
+ bio_list_merge(&bios, &b->bios);
|
|
|
+ bio_list_init(&b->bios);
|
|
|
+ b->commit_scheduled = false;
|
|
|
+ spin_unlock_irqrestore(&b->lock, flags);
|
|
|
+
|
|
|
+ r = b->commit_op(b->commit_context);
|
|
|
+
|
|
|
+ list_for_each_entry_safe(ws, tmp, &work_items, entry) {
|
|
|
+ k = container_of(ws, struct continuation, ws);
|
|
|
+ k->input = r;
|
|
|
+ INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
|
|
|
+ queue_work(b->wq, ws);
|
|
|
+ }
|
|
|
+
|
|
|
+ while ((bio = bio_list_pop(&bios))) {
|
|
|
+ if (r) {
|
|
|
+ bio->bi_error = r;
|
|
|
+ bio_endio(bio);
|
|
|
+ } else
|
|
|
+ b->issue_op(bio, b->issue_context);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static void batcher_init(struct batcher *b,
|
|
|
+ int (*commit_op)(void *),
|
|
|
+ void *commit_context,
|
|
|
+ void (*issue_op)(struct bio *bio, void *),
|
|
|
+ void *issue_context,
|
|
|
+ struct workqueue_struct *wq)
|
|
|
+{
|
|
|
+ b->commit_op = commit_op;
|
|
|
+ b->commit_context = commit_context;
|
|
|
+ b->issue_op = issue_op;
|
|
|
+ b->issue_context = issue_context;
|
|
|
+ b->wq = wq;
|
|
|
+
|
|
|
+ spin_lock_init(&b->lock);
|
|
|
+ INIT_LIST_HEAD(&b->work_items);
|
|
|
+ bio_list_init(&b->bios);
|
|
|
+ INIT_WORK(&b->commit_work, __commit);
|
|
|
+ b->commit_scheduled = false;
|
|
|
+}
|
|
|
+
|
|
|
+static void async_commit(struct batcher *b)
|
|
|
+{
|
|
|
+ queue_work(b->wq, &b->commit_work);
|
|
|
+}
|
|
|
+
|
|
|
+static void continue_after_commit(struct batcher *b, struct continuation *k)
|
|
|
+{
|
|
|
+ unsigned long flags;
|
|
|
+ bool commit_scheduled;
|
|
|
+
|
|
|
+ spin_lock_irqsave(&b->lock, flags);
|
|
|
+ commit_scheduled = b->commit_scheduled;
|
|
|
+ list_add_tail(&k->ws.entry, &b->work_items);
|
|
|
+ spin_unlock_irqrestore(&b->lock, flags);
|
|
|
+
|
|
|
+ if (commit_scheduled)
|
|
|
+ async_commit(b);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Bios are errored if commit failed.
|
|
|
+ */
|
|
|
+static void issue_after_commit(struct batcher *b, struct bio *bio)
|
|
|
+{
|
|
|
+ unsigned long flags;
|
|
|
+ bool commit_scheduled;
|
|
|
+
|
|
|
+ spin_lock_irqsave(&b->lock, flags);
|
|
|
+ commit_scheduled = b->commit_scheduled;
|
|
|
+ bio_list_add(&b->bios, bio);
|
|
|
+ spin_unlock_irqrestore(&b->lock, flags);
|
|
|
+
|
|
|
+ if (commit_scheduled)
|
|
|
+ async_commit(b);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Call this if some urgent work is waiting for the commit to complete.
|
|
|
+ */
|
|
|
+static void schedule_commit(struct batcher *b)
|
|
|
+{
|
|
|
+ bool immediate;
|
|
|
+ unsigned long flags;
|
|
|
+
|
|
|
+ spin_lock_irqsave(&b->lock, flags);
|
|
|
+ immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
|
|
|
+ b->commit_scheduled = true;
|
|
|
+ spin_unlock_irqrestore(&b->lock, flags);
|
|
|
+
|
|
|
+ if (immediate)
|
|
|
+ async_commit(b);
|
|
|
+}
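
Again as an editorial sketch rather than part of the patch, this is roughly how a client of the batcher is wired together; my_commit and my_issue are hypothetical stand-ins for the cache's real commit and issue callbacks.

/* Editorial sketch only -- not part of this patch. */
static int my_commit(void *context)
{
	/* perform the expensive shared operation, e.g. a metadata commit */
	return 0;
}

static void my_issue(struct bio *bio, void *context)
{
	/* how held-back bios get sent once the commit has succeeded */
	generic_make_request(bio);
}

static void example_batcher_use(struct batcher *b, struct workqueue_struct *wq,
				struct bio *flush_bio, struct continuation *k)
{
	/* normally done once, when the target is constructed */
	batcher_init(b, my_commit, NULL, my_issue, NULL, wq);

	/* flush_bio is issued via my_issue() (or errored) after the next commit */
	issue_after_commit(b, flush_bio);

	/* k->ws is queued after the commit, with the result in k->input */
	continue_after_commit(b, k);

	/* ask for that commit to happen now */
	schedule_commit(b);
}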
|
|
|
+
|
|
|
/*
|
|
|
* There are a couple of places where we let a bio run, but want to do some
|
|
|
* work before calling its endio function. We do this by temporarily
|
|
@@ -189,31 +360,13 @@ struct cache_stats {
|
|
|
atomic_t write_miss;
|
|
|
atomic_t demotion;
|
|
|
atomic_t promotion;
|
|
|
+ atomic_t writeback;
|
|
|
atomic_t copies_avoided;
|
|
|
atomic_t cache_cell_clash;
|
|
|
atomic_t commit_count;
|
|
|
atomic_t discard_count;
|
|
|
};
|
|
|
|
|
|
-/*
|
|
|
- * Defines a range of cblocks, begin to (end - 1) are in the range. end is
|
|
|
- * the one-past-the-end value.
|
|
|
- */
|
|
|
-struct cblock_range {
|
|
|
- dm_cblock_t begin;
|
|
|
- dm_cblock_t end;
|
|
|
-};
|
|
|
-
|
|
|
-struct invalidation_request {
|
|
|
- struct list_head list;
|
|
|
- struct cblock_range *cblocks;
|
|
|
-
|
|
|
- atomic_t complete;
|
|
|
- int err;
|
|
|
-
|
|
|
- wait_queue_head_t result_wait;
|
|
|
-};
|
|
|
-
|
|
|
struct cache {
|
|
|
struct dm_target *ti;
|
|
|
struct dm_target_callbacks callbacks;
|
|
@@ -255,11 +408,7 @@ struct cache {
|
|
|
spinlock_t lock;
|
|
|
struct list_head deferred_cells;
|
|
|
struct bio_list deferred_bios;
|
|
|
- struct bio_list deferred_flush_bios;
|
|
|
struct bio_list deferred_writethrough_bios;
|
|
|
- struct list_head quiesced_migrations;
|
|
|
- struct list_head completed_migrations;
|
|
|
- struct list_head need_commit_migrations;
|
|
|
sector_t migration_threshold;
|
|
|
wait_queue_head_t migration_wait;
|
|
|
atomic_t nr_allocated_migrations;
|
|
@@ -270,9 +419,7 @@ struct cache {
|
|
|
*/
|
|
|
atomic_t nr_io_migrations;
|
|
|
|
|
|
- wait_queue_head_t quiescing_wait;
|
|
|
- atomic_t quiescing;
|
|
|
- atomic_t quiescing_ack;
|
|
|
+ struct rw_semaphore quiesce_lock;
|
|
|
|
|
|
/*
|
|
|
* cache_size entries, dirty if set
|
|
@@ -296,13 +443,11 @@ struct cache {
|
|
|
|
|
|
struct dm_kcopyd_client *copier;
|
|
|
struct workqueue_struct *wq;
|
|
|
- struct work_struct worker;
|
|
|
-
|
|
|
+ struct work_struct deferred_bio_worker;
|
|
|
+ struct work_struct deferred_writethrough_worker;
|
|
|
+ struct work_struct migration_worker;
|
|
|
struct delayed_work waker;
|
|
|
- unsigned long last_commit_jiffies;
|
|
|
-
|
|
|
- struct dm_bio_prison *prison;
|
|
|
- struct dm_deferred_set *all_io_ds;
|
|
|
+ struct dm_bio_prison_v2 *prison;
|
|
|
|
|
|
mempool_t *migration_pool;
|
|
|
|
|
@@ -330,12 +475,17 @@ struct cache {
|
|
|
struct list_head invalidation_requests;
|
|
|
|
|
|
struct io_tracker origin_tracker;
|
|
|
+
|
|
|
+ struct work_struct commit_ws;
|
|
|
+ struct batcher committer;
|
|
|
+
|
|
|
+ struct rw_semaphore background_work_lock;
|
|
|
};
|
|
|
|
|
|
struct per_bio_data {
|
|
|
bool tick:1;
|
|
|
unsigned req_nr:2;
|
|
|
- struct dm_deferred_entry *all_io_entry;
|
|
|
+ struct dm_bio_prison_cell_v2 *cell;
|
|
|
struct dm_hook_info hook_info;
|
|
|
sector_t len;
|
|
|
|
|
@@ -350,55 +500,64 @@ struct per_bio_data {
|
|
|
};
|
|
|
|
|
|
struct dm_cache_migration {
|
|
|
- struct list_head list;
|
|
|
+ struct continuation k;
|
|
|
struct cache *cache;
|
|
|
|
|
|
- unsigned long start_jiffies;
|
|
|
- dm_oblock_t old_oblock;
|
|
|
- dm_oblock_t new_oblock;
|
|
|
- dm_cblock_t cblock;
|
|
|
-
|
|
|
- bool err:1;
|
|
|
- bool discard:1;
|
|
|
- bool writeback:1;
|
|
|
- bool demote:1;
|
|
|
- bool promote:1;
|
|
|
- bool requeue_holder:1;
|
|
|
- bool invalidate:1;
|
|
|
+ struct policy_work *op;
|
|
|
+ struct bio *overwrite_bio;
|
|
|
+ struct dm_bio_prison_cell_v2 *cell;
|
|
|
|
|
|
- struct dm_bio_prison_cell *old_ocell;
|
|
|
- struct dm_bio_prison_cell *new_ocell;
|
|
|
+ dm_cblock_t invalidate_cblock;
|
|
|
+ dm_oblock_t invalidate_oblock;
|
|
|
};
|
|
|
|
|
|
-/*
|
|
|
- * Processing a bio in the worker thread may require these memory
|
|
|
- * allocations. We prealloc to avoid deadlocks (the same worker thread
|
|
|
- * frees them back to the mempool).
|
|
|
- */
|
|
|
-struct prealloc {
|
|
|
- struct dm_cache_migration *mg;
|
|
|
- struct dm_bio_prison_cell *cell1;
|
|
|
- struct dm_bio_prison_cell *cell2;
|
|
|
-};
|
|
|
+/*----------------------------------------------------------------*/
|
|
|
+
|
|
|
+static bool writethrough_mode(struct cache_features *f)
|
|
|
+{
|
|
|
+ return f->io_mode == CM_IO_WRITETHROUGH;
|
|
|
+}
|
|
|
+
|
|
|
+static bool writeback_mode(struct cache_features *f)
|
|
|
+{
|
|
|
+ return f->io_mode == CM_IO_WRITEBACK;
|
|
|
+}
|
|
|
+
|
|
|
+static inline bool passthrough_mode(struct cache_features *f)
|
|
|
+{
|
|
|
+ return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
|
|
|
+}
|
|
|
+
|
|
|
+/*----------------------------------------------------------------*/
|
|
|
+
|
|
|
+static void wake_deferred_bio_worker(struct cache *cache)
|
|
|
+{
|
|
|
+ queue_work(cache->wq, &cache->deferred_bio_worker);
|
|
|
+}
|
|
|
|
|
|
-static enum cache_metadata_mode get_cache_mode(struct cache *cache);
|
|
|
+static void wake_deferred_writethrough_worker(struct cache *cache)
|
|
|
+{
|
|
|
+ queue_work(cache->wq, &cache->deferred_writethrough_worker);
|
|
|
+}
|
|
|
|
|
|
-static void wake_worker(struct cache *cache)
|
|
|
+static void wake_migration_worker(struct cache *cache)
|
|
|
{
|
|
|
- queue_work(cache->wq, &cache->worker);
|
|
|
+ if (passthrough_mode(&cache->features))
|
|
|
+ return;
|
|
|
+
|
|
|
+ queue_work(cache->wq, &cache->migration_worker);
|
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
|
|
-static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
|
|
|
+static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
|
|
|
{
|
|
|
- /* FIXME: change to use a local slab. */
|
|
|
- return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
|
|
|
+ return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
|
|
|
}
|
|
|
|
|
|
-static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
|
|
|
+static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
|
|
|
{
|
|
|
- dm_bio_prison_free_cell(cache->prison, cell);
|
|
|
+ dm_bio_prison_free_cell_v2(cache->prison, cell);
|
|
|
}
|
|
|
|
|
|
static struct dm_cache_migration *alloc_migration(struct cache *cache)
|
|
@@ -424,146 +583,127 @@ static void free_migration(struct dm_cache_migration *mg)
|
|
|
mempool_free(mg, cache->migration_pool);
|
|
|
}
|
|
|
|
|
|
-static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
|
|
|
-{
|
|
|
- if (!p->mg) {
|
|
|
- p->mg = alloc_migration(cache);
|
|
|
- if (!p->mg)
|
|
|
- return -ENOMEM;
|
|
|
- }
|
|
|
-
|
|
|
- if (!p->cell1) {
|
|
|
- p->cell1 = alloc_prison_cell(cache);
|
|
|
- if (!p->cell1)
|
|
|
- return -ENOMEM;
|
|
|
- }
|
|
|
-
|
|
|
- if (!p->cell2) {
|
|
|
- p->cell2 = alloc_prison_cell(cache);
|
|
|
- if (!p->cell2)
|
|
|
- return -ENOMEM;
|
|
|
- }
|
|
|
-
|
|
|
- return 0;
|
|
|
-}
|
|
|
+/*----------------------------------------------------------------*/
|
|
|
|
|
|
-static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
|
|
|
+static inline dm_oblock_t oblock_succ(dm_oblock_t b)
|
|
|
{
|
|
|
- if (p->cell2)
|
|
|
- free_prison_cell(cache, p->cell2);
|
|
|
-
|
|
|
- if (p->cell1)
|
|
|
- free_prison_cell(cache, p->cell1);
|
|
|
-
|
|
|
- if (p->mg)
|
|
|
- free_migration(p->mg);
|
|
|
+ return to_oblock(from_oblock(b) + 1ull);
|
|
|
}
|
|
|
|
|
|
-static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
|
|
|
+static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
|
|
|
{
|
|
|
- struct dm_cache_migration *mg = p->mg;
|
|
|
-
|
|
|
- BUG_ON(!mg);
|
|
|
- p->mg = NULL;
|
|
|
-
|
|
|
- return mg;
|
|
|
+ key->virtual = 0;
|
|
|
+ key->dev = 0;
|
|
|
+ key->block_begin = from_oblock(begin);
|
|
|
+ key->block_end = from_oblock(end);
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * You must have a cell within the prealloc struct to return. If not this
|
|
|
- * function will BUG() rather than returning NULL.
|
|
|
+ * We have two lock levels: level 0, which prevents WRITEs, and level 1,
+ * which prevents *both* READs and WRITEs.
|
|
|
*/
|
|
|
-static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
|
|
|
+#define WRITE_LOCK_LEVEL 0
|
|
|
+#define READ_WRITE_LOCK_LEVEL 1
|
|
|
+
|
|
|
+static unsigned lock_level(struct bio *bio)
|
|
|
{
|
|
|
- struct dm_bio_prison_cell *r = NULL;
|
|
|
+ return bio_data_dir(bio) == WRITE ?
|
|
|
+ WRITE_LOCK_LEVEL :
|
|
|
+ READ_WRITE_LOCK_LEVEL;
|
|
|
+}
|
|
|
|
|
|
- if (p->cell1) {
|
|
|
- r = p->cell1;
|
|
|
- p->cell1 = NULL;
|
|
|
+/*----------------------------------------------------------------
|
|
|
+ * Per bio data
|
|
|
+ *--------------------------------------------------------------*/
|
|
|
|
|
|
- } else if (p->cell2) {
|
|
|
- r = p->cell2;
|
|
|
- p->cell2 = NULL;
|
|
|
- } else
|
|
|
- BUG();
|
|
|
+/*
|
|
|
+ * If using writeback, leave out struct per_bio_data's writethrough fields.
|
|
|
+ */
|
|
|
+#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
|
|
|
+#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
|
|
|
|
|
|
- return r;
|
|
|
+static size_t get_per_bio_data_size(struct cache *cache)
|
|
|
+{
|
|
|
+ return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * You can't have more than two cells in a prealloc struct. BUG() will be
|
|
|
- * called if you try and overfill.
|
|
|
- */
|
|
|
-static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
|
|
|
+static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
|
|
|
{
|
|
|
- if (!p->cell2)
|
|
|
- p->cell2 = cell;
|
|
|
+ struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
|
|
|
+ BUG_ON(!pb);
|
|
|
+ return pb;
|
|
|
+}
|
|
|
|
|
|
- else if (!p->cell1)
|
|
|
- p->cell1 = cell;
|
|
|
+static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
|
|
|
+{
|
|
|
+ struct per_bio_data *pb = get_per_bio_data(bio, data_size);
|
|
|
|
|
|
- else
|
|
|
- BUG();
|
|
|
+ pb->tick = false;
|
|
|
+ pb->req_nr = dm_bio_get_target_bio_nr(bio);
|
|
|
+ pb->cell = NULL;
|
|
|
+ pb->len = 0;
|
|
|
+
|
|
|
+ return pb;
|
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------*/
|
|
|
|
|
|
-static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
|
|
|
+static void defer_bio(struct cache *cache, struct bio *bio)
|
|
|
{
|
|
|
- key->virtual = 0;
|
|
|
- key->dev = 0;
|
|
|
- key->block_begin = from_oblock(begin);
|
|
|
- key->block_end = from_oblock(end);
|
|
|
-}
|
|
|
+ unsigned long flags;
|
|
|
|
|
|
-/*
|
|
|
- * The caller hands in a preallocated cell, and a free function for it.
|
|
|
- * The cell will be freed if there's an error, or if it wasn't used because
|
|
|
- * a cell with that key already exists.
|
|
|
- */
|
|
|
-typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
|
|
|
+ spin_lock_irqsave(&cache->lock, flags);
|
|
|
+ bio_list_add(&cache->deferred_bios, bio);
|
|
|
+ spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+
|
|
|
+ wake_deferred_bio_worker(cache);
|
|
|
+}
|
|
|
|
|
|
-static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
|
|
|
- struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
|
|
|
- cell_free_fn free_fn, void *free_context,
|
|
|
- struct dm_bio_prison_cell **cell_result)
|
|
|
+static void defer_bios(struct cache *cache, struct bio_list *bios)
|
|
|
{
|
|
|
- int r;
|
|
|
- struct dm_cell_key key;
|
|
|
+ unsigned long flags;
|
|
|
|
|
|
- build_key(oblock_begin, oblock_end, &key);
|
|
|
- r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
|
|
|
- if (r)
|
|
|
- free_fn(free_context, cell_prealloc);
|
|
|
+ spin_lock_irqsave(&cache->lock, flags);
|
|
|
+ bio_list_merge(&cache->deferred_bios, bios);
|
|
|
+ bio_list_init(bios);
|
|
|
+ spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
|
|
|
- return r;
|
|
|
+ wake_deferred_bio_worker(cache);
|
|
|
}
|
|
|
|
|
|
-static int bio_detain(struct cache *cache, dm_oblock_t oblock,
|
|
|
- struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
|
|
|
- cell_free_fn free_fn, void *free_context,
|
|
|
- struct dm_bio_prison_cell **cell_result)
|
|
|
+/*----------------------------------------------------------------*/
|
|
|
+
|
|
|
+static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
|
|
|
{
|
|
|
+ bool r;
|
|
|
+ size_t pb_size;
|
|
|
+ struct per_bio_data *pb;
|
|
|
+ struct dm_cell_key_v2 key;
|
|
|
dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
|
|
|
- return bio_detain_range(cache, oblock, end, bio,
|
|
|
- cell_prealloc, free_fn, free_context, cell_result);
|
|
|
-}
|
|
|
+ struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
|
|
|
|
|
|
-static int get_cell(struct cache *cache,
|
|
|
- dm_oblock_t oblock,
|
|
|
- struct prealloc *structs,
|
|
|
- struct dm_bio_prison_cell **cell_result)
|
|
|
-{
|
|
|
- int r;
|
|
|
- struct dm_cell_key key;
|
|
|
- struct dm_bio_prison_cell *cell_prealloc;
|
|
|
+ cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
|
|
|
+ if (!cell_prealloc) {
|
|
|
+ defer_bio(cache, bio);
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ build_key(oblock, end, &key);
|
|
|
+ r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
|
|
|
+ if (!r) {
|
|
|
+ /*
|
|
|
+ * Failed to get the lock.
|
|
|
+ */
|
|
|
+ free_prison_cell(cache, cell_prealloc);
|
|
|
+ return r;
|
|
|
+ }
|
|
|
|
|
|
- cell_prealloc = prealloc_get_cell(structs);
|
|
|
+ if (cell != cell_prealloc)
|
|
|
+ free_prison_cell(cache, cell_prealloc);
|
|
|
|
|
|
- build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
|
|
|
- r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
|
|
|
- if (r)
|
|
|
- prealloc_put_cell(structs, cell_prealloc);
|
|
|
+ pb_size = get_per_bio_data_size(cache);
|
|
|
+ pb = get_per_bio_data(bio, pb_size);
|
|
|
+ pb->cell = cell;
|
|
|
|
|
|
return r;
|
|
|
}
|
|
@@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b)
|
|
|
return test_bit(from_cblock(b), cache->dirty_bitset);
|
|
|
}
|
|
|
|
|
|
-static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
|
|
|
+static void set_dirty(struct cache *cache, dm_cblock_t cblock)
|
|
|
{
|
|
|
if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
|
|
|
atomic_inc(&cache->nr_dirty);
|
|
|
- policy_set_dirty(cache->policy, oblock);
|
|
|
+ policy_set_dirty(cache->policy, cblock);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
|
|
|
+/*
|
|
|
+ * These two are called after migrations to force the policy and the
+ * dirty bitset back into sync.
|
|
|
+ */
|
|
|
+static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
|
|
|
+{
|
|
|
+ if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
|
|
|
+ atomic_inc(&cache->nr_dirty);
|
|
|
+ policy_set_dirty(cache->policy, cblock);
|
|
|
+}
|
|
|
+
|
|
|
+static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
|
|
|
{
|
|
|
if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
|
|
|
- policy_clear_dirty(cache->policy, oblock);
|
|
|
if (atomic_dec_return(&cache->nr_dirty) == 0)
|
|
|
dm_table_event(cache->ti->table);
|
|
|
}
|
|
|
+
|
|
|
+ policy_clear_dirty(cache->policy, cblock);
|
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------*/
|
|
@@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
|
|
|
oblocks_per_dblock(cache)));
|
|
|
}
|
|
|
|
|
|
-static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
|
|
|
-{
|
|
|
- return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
|
|
|
-}
|
|
|
-
|
|
|
static void set_discard(struct cache *cache, dm_dblock_t b)
|
|
|
{
|
|
|
unsigned long flags;
|
|
@@ -679,89 +826,12 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
|
|
|
return r;
|
|
|
}
|
|
|
|
|
|
-/*----------------------------------------------------------------*/
|
|
|
-
|
|
|
-static void load_stats(struct cache *cache)
|
|
|
+/*----------------------------------------------------------------
|
|
|
+ * Remapping
|
|
|
+ *--------------------------------------------------------------*/
|
|
|
+static void remap_to_origin(struct cache *cache, struct bio *bio)
|
|
|
{
|
|
|
- struct dm_cache_statistics stats;
|
|
|
-
|
|
|
- dm_cache_metadata_get_stats(cache->cmd, &stats);
|
|
|
- atomic_set(&cache->stats.read_hit, stats.read_hits);
|
|
|
- atomic_set(&cache->stats.read_miss, stats.read_misses);
|
|
|
- atomic_set(&cache->stats.write_hit, stats.write_hits);
|
|
|
- atomic_set(&cache->stats.write_miss, stats.write_misses);
|
|
|
-}
|
|
|
-
|
|
|
-static void save_stats(struct cache *cache)
|
|
|
-{
|
|
|
- struct dm_cache_statistics stats;
|
|
|
-
|
|
|
- if (get_cache_mode(cache) >= CM_READ_ONLY)
|
|
|
- return;
|
|
|
-
|
|
|
- stats.read_hits = atomic_read(&cache->stats.read_hit);
|
|
|
- stats.read_misses = atomic_read(&cache->stats.read_miss);
|
|
|
- stats.write_hits = atomic_read(&cache->stats.write_hit);
|
|
|
- stats.write_misses = atomic_read(&cache->stats.write_miss);
|
|
|
-
|
|
|
- dm_cache_metadata_set_stats(cache->cmd, &stats);
|
|
|
-}
|
|
|
-
|
|
|
-/*----------------------------------------------------------------
|
|
|
- * Per bio data
|
|
|
- *--------------------------------------------------------------*/
|
|
|
-
|
|
|
-/*
|
|
|
- * If using writeback, leave out struct per_bio_data's writethrough fields.
|
|
|
- */
|
|
|
-#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
|
|
|
-#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
|
|
|
-
|
|
|
-static bool writethrough_mode(struct cache_features *f)
|
|
|
-{
|
|
|
- return f->io_mode == CM_IO_WRITETHROUGH;
|
|
|
-}
|
|
|
-
|
|
|
-static bool writeback_mode(struct cache_features *f)
|
|
|
-{
|
|
|
- return f->io_mode == CM_IO_WRITEBACK;
|
|
|
-}
|
|
|
-
|
|
|
-static bool passthrough_mode(struct cache_features *f)
|
|
|
-{
|
|
|
- return f->io_mode == CM_IO_PASSTHROUGH;
|
|
|
-}
|
|
|
-
|
|
|
-static size_t get_per_bio_data_size(struct cache *cache)
|
|
|
-{
|
|
|
- return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
|
|
|
-}
|
|
|
-
|
|
|
-static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
|
|
|
-{
|
|
|
- struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
|
|
|
- BUG_ON(!pb);
|
|
|
- return pb;
|
|
|
-}
|
|
|
-
|
|
|
-static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
|
|
|
-{
|
|
|
- struct per_bio_data *pb = get_per_bio_data(bio, data_size);
|
|
|
-
|
|
|
- pb->tick = false;
|
|
|
- pb->req_nr = dm_bio_get_target_bio_nr(bio);
|
|
|
- pb->all_io_entry = NULL;
|
|
|
- pb->len = 0;
|
|
|
-
|
|
|
- return pb;
|
|
|
-}
|
|
|
-
|
|
|
-/*----------------------------------------------------------------
|
|
|
- * Remapping
|
|
|
- *--------------------------------------------------------------*/
|
|
|
-static void remap_to_origin(struct cache *cache, struct bio *bio)
|
|
|
-{
|
|
|
- bio->bi_bdev = cache->origin_dev->bdev;
|
|
|
+ bio->bi_bdev = cache->origin_dev->bdev;
|
|
|
}
|
|
|
|
|
|
static void remap_to_cache(struct cache *cache, struct bio *bio,
|
|
@@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
|
|
|
}
|
|
|
|
|
|
static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
|
|
|
- dm_oblock_t oblock)
|
|
|
+ dm_oblock_t oblock)
|
|
|
{
|
|
|
+ /* FIXME: this is called way too much. */
|
|
|
check_if_tick_bio_needed(cache, bio);
|
|
|
remap_to_origin(cache, bio);
|
|
|
if (bio_data_dir(bio) == WRITE)
|
|
@@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
|
|
|
check_if_tick_bio_needed(cache, bio);
|
|
|
remap_to_cache(cache, bio, cblock);
|
|
|
if (bio_data_dir(bio) == WRITE) {
|
|
|
- set_dirty(cache, oblock, cblock);
|
|
|
+ set_dirty(cache, cblock);
|
|
|
clear_discard(cache, oblock_to_dblock(cache, oblock));
|
|
|
}
|
|
|
}
|
|
@@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
|
|
|
return to_oblock(block_nr);
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * You must increment the deferred set whilst the prison cell is held. To
|
|
|
- * encourage this, we ask for 'cell' to be passed in.
|
|
|
- */
|
|
|
-static void inc_ds(struct cache *cache, struct bio *bio,
|
|
|
- struct dm_bio_prison_cell *cell)
|
|
|
-{
|
|
|
- size_t pb_data_size = get_per_bio_data_size(cache);
|
|
|
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
|
|
|
-
|
|
|
- BUG_ON(!cell);
|
|
|
- BUG_ON(pb->all_io_entry);
|
|
|
-
|
|
|
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
|
|
|
-}
|
|
|
-
|
|
|
static bool accountable_bio(struct cache *cache, struct bio *bio)
|
|
|
{
|
|
|
return ((bio->bi_bdev == cache->origin_dev->bdev) &&
|
|
@@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio)
|
|
|
generic_make_request(bio);
|
|
|
}
|
|
|
|
|
|
-static void issue(struct cache *cache, struct bio *bio)
|
|
|
-{
|
|
|
- unsigned long flags;
|
|
|
-
|
|
|
- if (!op_is_flush(bio->bi_opf)) {
|
|
|
- accounted_request(cache, bio);
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- * Batch together any bios that trigger commits and then issue a
|
|
|
- * single commit for them in do_worker().
|
|
|
- */
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- cache->commit_requested = true;
|
|
|
- bio_list_add(&cache->deferred_flush_bios, bio);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
-}
|
|
|
-
|
|
|
-static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
|
|
|
+static void issue_op(struct bio *bio, void *context)
|
|
|
{
|
|
|
- inc_ds(cache, bio, cell);
|
|
|
- issue(cache, bio);
|
|
|
+ struct cache *cache = context;
|
|
|
+ accounted_request(cache, bio);
|
|
|
}
|
|
|
|
|
|
static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
|
|
@@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
|
|
|
bio_list_add(&cache->deferred_writethrough_bios, bio);
|
|
|
spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
|
|
|
- wake_worker(cache);
|
|
|
+ wake_deferred_writethrough_worker(cache);
|
|
|
}
|
|
|
|
|
|
static void writethrough_endio(struct bio *bio)
|
|
@@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio)
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
+ * FIXME: send in parallel, huge latency as is.
|
|
|
* When running in writethrough mode we need to send writes to clean blocks
|
|
|
* to both the cache and origin devices. In future we'd like to clone the
|
|
|
* bio and send them in parallel, but for now we're doing them in
|
|
@@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r
|
|
|
set_cache_mode(cache, CM_READ_ONLY);
|
|
|
}
|
|
|
|
|
|
+/*----------------------------------------------------------------*/
|
|
|
+
|
|
|
+static void load_stats(struct cache *cache)
|
|
|
+{
|
|
|
+ struct dm_cache_statistics stats;
|
|
|
+
|
|
|
+ dm_cache_metadata_get_stats(cache->cmd, &stats);
|
|
|
+ atomic_set(&cache->stats.read_hit, stats.read_hits);
|
|
|
+ atomic_set(&cache->stats.read_miss, stats.read_misses);
|
|
|
+ atomic_set(&cache->stats.write_hit, stats.write_hits);
|
|
|
+ atomic_set(&cache->stats.write_miss, stats.write_misses);
|
|
|
+}
|
|
|
+
|
|
|
+static void save_stats(struct cache *cache)
|
|
|
+{
|
|
|
+ struct dm_cache_statistics stats;
|
|
|
+
|
|
|
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
|
|
|
+ return;
|
|
|
+
|
|
|
+ stats.read_hits = atomic_read(&cache->stats.read_hit);
|
|
|
+ stats.read_misses = atomic_read(&cache->stats.read_miss);
|
|
|
+ stats.write_hits = atomic_read(&cache->stats.write_hit);
|
|
|
+ stats.write_misses = atomic_read(&cache->stats.write_miss);
|
|
|
+
|
|
|
+ dm_cache_metadata_set_stats(cache->cmd, &stats);
|
|
|
+}
|
|
|
+
|
|
|
+static void update_stats(struct cache_stats *stats, enum policy_operation op)
|
|
|
+{
|
|
|
+ switch (op) {
|
|
|
+ case POLICY_PROMOTE:
|
|
|
+ atomic_inc(&stats->promotion);
|
|
|
+ break;
|
|
|
+
|
|
|
+ case POLICY_DEMOTE:
|
|
|
+ atomic_inc(&stats->demotion);
|
|
|
+ break;
|
|
|
+
|
|
|
+ case POLICY_WRITEBACK:
|
|
|
+ atomic_inc(&stats->writeback);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
/*----------------------------------------------------------------
|
|
|
* Migration processing
|
|
|
*
|
|
|
* Migration covers moving data from the origin device to the cache, or
|
|
|
* vice versa.
|
|
|
*--------------------------------------------------------------*/
|
|
|
+
|
|
|
static void inc_io_migrations(struct cache *cache)
|
|
|
{
|
|
|
atomic_inc(&cache->nr_io_migrations);
|
|
@@ -1067,213 +1150,109 @@ static bool discard_or_flush(struct bio *bio)
|
|
|
return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
|
|
|
}
|
|
|
|
|
|
-static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
|
|
|
-{
|
|
|
- if (discard_or_flush(cell->holder)) {
|
|
|
- /*
|
|
|
- * We have to handle these bios individually.
|
|
|
- */
|
|
|
- dm_cell_release(cache->prison, cell, &cache->deferred_bios);
|
|
|
- free_prison_cell(cache, cell);
|
|
|
- } else
|
|
|
- list_add_tail(&cell->user_list, &cache->deferred_cells);
|
|
|
-}
|
|
|
-
|
|
|
-static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
|
|
|
+static void calc_discard_block_range(struct cache *cache, struct bio *bio,
|
|
|
+ dm_dblock_t *b, dm_dblock_t *e)
|
|
|
{
|
|
|
- unsigned long flags;
|
|
|
-
|
|
|
- if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
|
|
|
- /*
|
|
|
- * There was no prisoner to promote to holder, the
|
|
|
- * cell has been released.
|
|
|
- */
|
|
|
- free_prison_cell(cache, cell);
|
|
|
- return;
|
|
|
- }
|
|
|
+ sector_t sb = bio->bi_iter.bi_sector;
|
|
|
+ sector_t se = bio_end_sector(bio);
|
|
|
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- __cell_defer(cache, cell);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+ *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
|
|
|
|
|
|
- wake_worker(cache);
|
|
|
+ if (se - sb < cache->discard_block_size)
|
|
|
+ *e = *b;
|
|
|
+ else
|
|
|
+ *e = to_dblock(block_div(se, cache->discard_block_size));
|
|
|
}
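
An editorial worked example of the rounding above (not part of the patch); the 128-sector discard block size is an arbitrary assumption.

/*
 * Editorial sketch only -- not part of this patch.
 * With cache->discard_block_size == 128 sectors and a discard bio
 * covering sectors [100, 500):
 *
 *	sb = 100, se = 500
 *	*b = dm_sector_div_up(100, 128) = 1	(start rounded up)
 *	*e = block_div(500, 128)        = 3	(end rounded down)
 *
 * so only discard blocks 1 and 2, which the bio covers completely, get
 * marked.  A bio shorter than one discard block gives *e == *b and
 * nothing is marked.
 */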
|
|
|
|
|
|
-static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
|
|
|
-{
|
|
|
- dm_cell_error(cache->prison, cell, err);
|
|
|
- free_prison_cell(cache, cell);
|
|
|
-}
|
|
|
+/*----------------------------------------------------------------*/
|
|
|
|
|
|
-static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
|
|
|
+static void prevent_background_work(struct cache *cache)
|
|
|
{
|
|
|
- cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
|
|
|
+ lockdep_off();
|
|
|
+ down_write(&cache->background_work_lock);
|
|
|
+ lockdep_on();
|
|
|
}
|
|
|
|
|
|
-static void free_io_migration(struct dm_cache_migration *mg)
|
|
|
+static void allow_background_work(struct cache *cache)
|
|
|
{
|
|
|
- struct cache *cache = mg->cache;
|
|
|
-
|
|
|
- dec_io_migrations(cache);
|
|
|
- free_migration(mg);
|
|
|
- wake_worker(cache);
|
|
|
+ lockdep_off();
|
|
|
+ up_write(&cache->background_work_lock);
|
|
|
+ lockdep_on();
|
|
|
}
|
|
|
|
|
|
-static void migration_failure(struct dm_cache_migration *mg)
|
|
|
+static bool background_work_begin(struct cache *cache)
|
|
|
{
|
|
|
- struct cache *cache = mg->cache;
|
|
|
- const char *dev_name = cache_device_name(cache);
|
|
|
-
|
|
|
- if (mg->writeback) {
|
|
|
- DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
|
|
|
- set_dirty(cache, mg->old_oblock, mg->cblock);
|
|
|
- cell_defer(cache, mg->old_ocell, false);
|
|
|
-
|
|
|
- } else if (mg->demote) {
|
|
|
- DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
|
|
|
- policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
|
|
|
+ bool r;
|
|
|
|
|
|
- cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
|
|
|
- if (mg->promote)
|
|
|
- cell_defer(cache, mg->new_ocell, true);
|
|
|
- } else {
|
|
|
- DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
|
|
|
- policy_remove_mapping(cache->policy, mg->new_oblock);
|
|
|
- cell_defer(cache, mg->new_ocell, true);
|
|
|
- }
|
|
|
+ lockdep_off();
|
|
|
+ r = down_read_trylock(&cache->background_work_lock);
|
|
|
+ lockdep_on();
|
|
|
|
|
|
- free_io_migration(mg);
|
|
|
+ return r;
|
|
|
}
|
|
|
|
|
|
-static void migration_success_pre_commit(struct dm_cache_migration *mg)
|
|
|
+static void background_work_end(struct cache *cache)
|
|
|
{
|
|
|
- int r;
|
|
|
- unsigned long flags;
|
|
|
- struct cache *cache = mg->cache;
|
|
|
-
|
|
|
- if (mg->writeback) {
|
|
|
- clear_dirty(cache, mg->old_oblock, mg->cblock);
|
|
|
- cell_defer(cache, mg->old_ocell, false);
|
|
|
- free_io_migration(mg);
|
|
|
- return;
|
|
|
+ lockdep_off();
|
|
|
+ up_read(&cache->background_work_lock);
|
|
|
+ lockdep_on();
|
|
|
+}
|
|
|
|
|
|
- } else if (mg->demote) {
|
|
|
- r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
|
|
|
- if (r) {
|
|
|
- DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
|
|
|
- cache_device_name(cache));
|
|
|
- metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
|
|
|
- policy_force_mapping(cache->policy, mg->new_oblock,
|
|
|
- mg->old_oblock);
|
|
|
- if (mg->promote)
|
|
|
- cell_defer(cache, mg->new_ocell, true);
|
|
|
- free_io_migration(mg);
|
|
|
- return;
|
|
|
- }
|
|
|
- } else {
|
|
|
- r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
|
|
|
- if (r) {
|
|
|
- DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
|
|
|
- cache_device_name(cache));
|
|
|
- metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
|
|
|
- policy_remove_mapping(cache->policy, mg->new_oblock);
|
|
|
- free_io_migration(mg);
|
|
|
- return;
|
|
|
- }
|
|
|
- }
|
|
|
+/*----------------------------------------------------------------*/
|
|
|
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- list_add_tail(&mg->list, &cache->need_commit_migrations);
|
|
|
- cache->commit_requested = true;
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+static void quiesce(struct dm_cache_migration *mg,
|
|
|
+ void (*continuation)(struct work_struct *))
|
|
|
+{
|
|
|
+ init_continuation(&mg->k, continuation);
|
|
|
+ dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
|
|
|
}
|
|
|
|
|
|
-static void migration_success_post_commit(struct dm_cache_migration *mg)
|
|
|
+static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
|
|
|
{
|
|
|
- unsigned long flags;
|
|
|
- struct cache *cache = mg->cache;
|
|
|
-
|
|
|
- if (mg->writeback) {
|
|
|
- DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
|
|
|
- cache_device_name(cache));
|
|
|
- return;
|
|
|
-
|
|
|
- } else if (mg->demote) {
|
|
|
- cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
|
|
|
-
|
|
|
- if (mg->promote) {
|
|
|
- mg->demote = false;
|
|
|
-
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- list_add_tail(&mg->list, &cache->quiesced_migrations);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
-
|
|
|
- } else {
|
|
|
- if (mg->invalidate)
|
|
|
- policy_remove_mapping(cache->policy, mg->old_oblock);
|
|
|
- free_io_migration(mg);
|
|
|
- }
|
|
|
-
|
|
|
- } else {
|
|
|
- if (mg->requeue_holder) {
|
|
|
- clear_dirty(cache, mg->new_oblock, mg->cblock);
|
|
|
- cell_defer(cache, mg->new_ocell, true);
|
|
|
- } else {
|
|
|
- /*
|
|
|
- * The block was promoted via an overwrite, so it's dirty.
|
|
|
- */
|
|
|
- set_dirty(cache, mg->new_oblock, mg->cblock);
|
|
|
- bio_endio(mg->new_ocell->holder);
|
|
|
- cell_defer(cache, mg->new_ocell, false);
|
|
|
- }
|
|
|
- free_io_migration(mg);
|
|
|
- }
|
|
|
+ struct continuation *k = container_of(ws, struct continuation, ws);
|
|
|
+ return container_of(k, struct dm_cache_migration, k);
|
|
|
}
|
|
|
|
|
|
static void copy_complete(int read_err, unsigned long write_err, void *context)
|
|
|
{
|
|
|
- unsigned long flags;
|
|
|
- struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
|
|
|
- struct cache *cache = mg->cache;
|
|
|
+ struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
|
|
|
|
|
|
if (read_err || write_err)
|
|
|
- mg->err = true;
|
|
|
-
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- list_add_tail(&mg->list, &cache->completed_migrations);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+ mg->k.input = -EIO;
|
|
|
|
|
|
- wake_worker(cache);
|
|
|
+ queue_continuation(mg->cache->wq, &mg->k);
|
|
|
}
|
|
|
|
|
|
-static void issue_copy(struct dm_cache_migration *mg)
|
|
|
+static int copy(struct dm_cache_migration *mg, bool promote)
|
|
|
{
|
|
|
int r;
|
|
|
struct dm_io_region o_region, c_region;
|
|
|
struct cache *cache = mg->cache;
|
|
|
- sector_t cblock = from_cblock(mg->cblock);
|
|
|
|
|
|
o_region.bdev = cache->origin_dev->bdev;
|
|
|
+ o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
|
|
|
o_region.count = cache->sectors_per_block;
|
|
|
|
|
|
c_region.bdev = cache->cache_dev->bdev;
|
|
|
- c_region.sector = cblock * cache->sectors_per_block;
|
|
|
+ c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
|
|
|
c_region.count = cache->sectors_per_block;
|
|
|
|
|
|
- if (mg->writeback || mg->demote) {
|
|
|
- /* demote */
|
|
|
- o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
|
|
|
- r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
|
|
|
- } else {
|
|
|
- /* promote */
|
|
|
- o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
|
|
|
- r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
|
|
|
- }
|
|
|
+ if (promote)
|
|
|
+ r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
|
|
|
+ else
|
|
|
+ r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
|
|
|
|
|
|
- if (r < 0) {
|
|
|
- DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
|
|
|
- migration_failure(mg);
|
|
|
- }
|
|
|
+ return r;
|
|
|
+}
|
|
|
+
|
|
|
+static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
|
|
|
+{
|
|
|
+ size_t pb_data_size = get_per_bio_data_size(cache);
|
|
|
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
|
|
|
+
|
|
|
+ if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
|
|
|
+ free_prison_cell(cache, pb->cell);
|
|
|
+ pb->cell = NULL;
|
|
|
}
|
|
|
|
|
|
static void overwrite_endio(struct bio *bio)
|
|
@@ -1282,930 +1261,752 @@ static void overwrite_endio(struct bio *bio)
|
|
|
struct cache *cache = mg->cache;
|
|
|
size_t pb_data_size = get_per_bio_data_size(cache);
|
|
|
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
|
|
|
- unsigned long flags;
|
|
|
|
|
|
dm_unhook_bio(&pb->hook_info, bio);
|
|
|
|
|
|
if (bio->bi_error)
|
|
|
- mg->err = true;
|
|
|
-
|
|
|
- mg->requeue_holder = false;
|
|
|
+ mg->k.input = bio->bi_error;
|
|
|
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- list_add_tail(&mg->list, &cache->completed_migrations);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
-
|
|
|
- wake_worker(cache);
|
|
|
+ queue_continuation(mg->cache->wq, &mg->k);
|
|
|
}
|
|
|
|
|
|
-static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
|
|
|
+static void overwrite(struct dm_cache_migration *mg,
|
|
|
+ void (*continuation)(struct work_struct *))
|
|
|
{
|
|
|
+ struct bio *bio = mg->overwrite_bio;
|
|
|
size_t pb_data_size = get_per_bio_data_size(mg->cache);
|
|
|
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
|
|
|
|
|
|
dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
|
|
|
- remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
|
|
|
|
|
|
/*
|
|
|
- * No need to inc_ds() here, since the cell will be held for the
|
|
|
- * duration of the io.
|
|
|
+ * The overwrite bio is part of the copy operation, so it does not
+ * set/clear the discard or dirty flags.
|
|
|
*/
|
|
|
+ if (mg->op->op == POLICY_PROMOTE)
|
|
|
+ remap_to_cache(mg->cache, bio, mg->op->cblock);
|
|
|
+ else
|
|
|
+ remap_to_origin(mg->cache, bio);
|
|
|
+
|
|
|
+ init_continuation(&mg->k, continuation);
|
|
|
accounted_request(mg->cache, bio);
|
|
|
}
|
|
|
|
|
|
-static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
|
|
|
+/*
|
|
|
+ * Migration steps:
|
|
|
+ *
|
|
|
+ * 1) exclusive lock preventing WRITEs
|
|
|
+ * 2) quiesce
|
|
|
+ * 3) copy or issue overwrite bio
|
|
|
+ * 4) upgrade to exclusive lock preventing READs and WRITEs
|
|
|
+ * 5) quiesce
|
|
|
+ * 6) update metadata and commit
|
|
|
+ * 7) unlock
|
|
|
+ */
|
|
|
+static void mg_complete(struct dm_cache_migration *mg, bool success)
|
|
|
{
|
|
|
- return (bio_data_dir(bio) == WRITE) &&
|
|
|
- (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
|
|
|
+ struct bio_list bios;
|
|
|
+ struct cache *cache = mg->cache;
|
|
|
+ struct policy_work *op = mg->op;
|
|
|
+ dm_cblock_t cblock = op->cblock;
|
|
|
+
|
|
|
+ if (success)
|
|
|
+ update_stats(&cache->stats, op->op);
|
|
|
+
|
|
|
+ switch (op->op) {
|
|
|
+ case POLICY_PROMOTE:
|
|
|
+ clear_discard(cache, oblock_to_dblock(cache, op->oblock));
|
|
|
+ policy_complete_background_work(cache->policy, op, success);
|
|
|
+
|
|
|
+ if (mg->overwrite_bio) {
|
|
|
+ if (success)
|
|
|
+ force_set_dirty(cache, cblock);
|
|
|
+ else
|
|
|
+ mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
|
|
|
+ bio_endio(mg->overwrite_bio);
|
|
|
+ } else {
|
|
|
+ if (success)
|
|
|
+ force_clear_dirty(cache, cblock);
|
|
|
+ dec_io_migrations(cache);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+
|
|
|
+ case POLICY_DEMOTE:
|
|
|
+ /*
|
|
|
+ * We clear dirty here to update the nr_dirty counter.
|
|
|
+ */
|
|
|
+ if (success)
|
|
|
+ force_clear_dirty(cache, cblock);
|
|
|
+ policy_complete_background_work(cache->policy, op, success);
|
|
|
+ dec_io_migrations(cache);
|
|
|
+ break;
|
|
|
+
|
|
|
+ case POLICY_WRITEBACK:
|
|
|
+ if (success)
|
|
|
+ force_clear_dirty(cache, cblock);
|
|
|
+ policy_complete_background_work(cache->policy, op, success);
|
|
|
+ dec_io_migrations(cache);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ bio_list_init(&bios);
|
|
|
+ if (mg->cell) {
|
|
|
+ if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
|
|
|
+ free_prison_cell(cache, mg->cell);
|
|
|
+ }
|
|
|
+
|
|
|
+ free_migration(mg);
|
|
|
+ defer_bios(cache, &bios);
|
|
|
+ wake_migration_worker(cache);
|
|
|
+
|
|
|
+ background_work_end(cache);
|
|
|
}
|
|
|
|
|
|
-static void avoid_copy(struct dm_cache_migration *mg)
|
|
|
+static void mg_success(struct work_struct *ws)
|
|
|
{
|
|
|
- atomic_inc(&mg->cache->stats.copies_avoided);
|
|
|
- migration_success_pre_commit(mg);
|
|
|
+ struct dm_cache_migration *mg = ws_to_mg(ws);
|
|
|
+ mg_complete(mg, mg->k.input == 0);
|
|
|
}
|
|
|
|
|
|
-static void calc_discard_block_range(struct cache *cache, struct bio *bio,
|
|
|
- dm_dblock_t *b, dm_dblock_t *e)
|
|
|
+static void mg_update_metadata(struct work_struct *ws)
|
|
|
{
|
|
|
- sector_t sb = bio->bi_iter.bi_sector;
|
|
|
- sector_t se = bio_end_sector(bio);
|
|
|
-
|
|
|
- *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
|
|
|
-
|
|
|
- if (se - sb < cache->discard_block_size)
|
|
|
- *e = *b;
|
|
|
- else
|
|
|
- *e = to_dblock(block_div(se, cache->discard_block_size));
|
|
|
-}
|
|
|
-
|
|
|
-static void issue_discard(struct dm_cache_migration *mg)
|
|
|
-{
|
|
|
- dm_dblock_t b, e;
|
|
|
- struct bio *bio = mg->new_ocell->holder;
|
|
|
- struct cache *cache = mg->cache;
|
|
|
-
|
|
|
- calc_discard_block_range(cache, bio, &b, &e);
|
|
|
- while (b != e) {
|
|
|
- set_discard(cache, b);
|
|
|
- b = to_dblock(from_dblock(b) + 1);
|
|
|
- }
|
|
|
-
|
|
|
- bio_endio(bio);
|
|
|
- cell_defer(cache, mg->new_ocell, false);
|
|
|
- free_migration(mg);
|
|
|
- wake_worker(cache);
|
|
|
-}
|
|
|
-
|
|
|
-static void issue_copy_or_discard(struct dm_cache_migration *mg)
|
|
|
-{
|
|
|
- bool avoid;
|
|
|
+ int r;
|
|
|
+ struct dm_cache_migration *mg = ws_to_mg(ws);
|
|
|
struct cache *cache = mg->cache;
|
|
|
+ struct policy_work *op = mg->op;
|
|
|
|
|
|
- if (mg->discard) {
|
|
|
- issue_discard(mg);
|
|
|
- return;
|
|
|
- }
|
|
|
+ switch (op->op) {
|
|
|
+ case POLICY_PROMOTE:
|
|
|
+ r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
|
|
|
+ if (r) {
|
|
|
+ DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
|
|
|
+ cache_device_name(cache));
|
|
|
+ metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
|
|
|
|
|
|
- if (mg->writeback || mg->demote)
|
|
|
- avoid = !is_dirty(cache, mg->cblock) ||
|
|
|
- is_discarded_oblock(cache, mg->old_oblock);
|
|
|
- else {
|
|
|
- struct bio *bio = mg->new_ocell->holder;
|
|
|
+ mg_complete(mg, false);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ mg_complete(mg, true);
|
|
|
+ break;
|
|
|
|
|
|
- avoid = is_discarded_oblock(cache, mg->new_oblock);
|
|
|
+ case POLICY_DEMOTE:
|
|
|
+ r = dm_cache_remove_mapping(cache->cmd, op->cblock);
|
|
|
+ if (r) {
|
|
|
+ DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
|
|
|
+ cache_device_name(cache));
|
|
|
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
|
|
|
|
|
|
- if (writeback_mode(&cache->features) &&
|
|
|
- !avoid && bio_writes_complete_block(cache, bio)) {
|
|
|
- issue_overwrite(mg, bio);
|
|
|
+ mg_complete(mg, false);
|
|
|
return;
|
|
|
}
|
|
|
- }
|
|
|
|
|
|
- avoid ? avoid_copy(mg) : issue_copy(mg);
|
|
|
+ /*
|
|
|
+ * It would be nice if we only had to commit when a REQ_FLUSH
|
|
|
+ * comes through. But there's one scenario that we have to
|
|
|
+ * look out for:
|
|
|
+ *
|
|
|
+ * - oblock x in a cache block
|
|
|
+ * - demotion occurs
+ * - cache block gets reallocated and overwritten
|
|
|
+ * - crash
|
|
|
+ *
|
|
|
+ * When we recover, because there was no commit the cache will
|
|
|
+ * rollback to having the data for vblock x in the cache block.
|
|
|
+ * But the cache block has since been overwritten, so it'll end
|
|
|
+ * up pointing to data that was never in 'x' during the history
|
|
|
+ * of the device.
|
|
|
+ *
|
|
|
+ * To avoid this issue we require a commit as part of the
|
|
|
+ * demotion operation.
|
|
|
+ */
|
|
|
+ init_continuation(&mg->k, mg_success);
|
|
|
+ continue_after_commit(&cache->committer, &mg->k);
|
|
|
+ schedule_commit(&cache->committer);
|
|
|
+ break;
|
|
|
+
|
|
|
+ case POLICY_WRITEBACK:
|
|
|
+ mg_complete(mg, true);
|
|
|
+ break;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
-static void complete_migration(struct dm_cache_migration *mg)
|
|
|
+static void mg_update_metadata_after_copy(struct work_struct *ws)
|
|
|
{
|
|
|
- if (mg->err)
|
|
|
- migration_failure(mg);
|
|
|
+ struct dm_cache_migration *mg = ws_to_mg(ws);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Did the copy succeed?
|
|
|
+ */
|
|
|
+ if (mg->k.input)
|
|
|
+ mg_complete(mg, false);
|
|
|
else
|
|
|
- migration_success_pre_commit(mg);
|
|
|
+ mg_update_metadata(ws);
|
|
|
}
|
|
|
|
|
|
-static void process_migrations(struct cache *cache, struct list_head *head,
|
|
|
- void (*fn)(struct dm_cache_migration *))
|
|
|
+static void mg_upgrade_lock(struct work_struct *ws)
|
|
|
{
|
|
|
- unsigned long flags;
|
|
|
- struct list_head list;
|
|
|
- struct dm_cache_migration *mg, *tmp;
|
|
|
+ int r;
|
|
|
+ struct dm_cache_migration *mg = ws_to_mg(ws);
|
|
|
|
|
|
- INIT_LIST_HEAD(&list);
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- list_splice_init(head, &list);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+ /*
|
|
|
+ * Did the copy succeed?
|
|
|
+ */
|
|
|
+ if (mg->k.input)
|
|
|
+ mg_complete(mg, false);
|
|
|
|
|
|
- list_for_each_entry_safe(mg, tmp, &list, list)
|
|
|
- fn(mg);
|
|
|
-}
|
|
|
+ else {
|
|
|
+ /*
|
|
|
+ * Now we want the lock to prevent both reads and writes.
|
|
|
+ */
|
|
|
+ r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
|
|
|
+ READ_WRITE_LOCK_LEVEL);
|
|
|
+ if (r < 0)
|
|
|
+ mg_complete(mg, false);
|
|
|
|
|
|
-static void __queue_quiesced_migration(struct dm_cache_migration *mg)
|
|
|
-{
|
|
|
- list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
|
|
|
+ else if (r)
|
|
|
+ quiesce(mg, mg_update_metadata);
|
|
|
+
|
|
|
+ else
|
|
|
+ mg_update_metadata(ws);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
-static void queue_quiesced_migration(struct dm_cache_migration *mg)
|
|
|
+static void mg_copy(struct work_struct *ws)
|
|
|
{
|
|
|
- unsigned long flags;
|
|
|
- struct cache *cache = mg->cache;
|
|
|
+ int r;
|
|
|
+ struct dm_cache_migration *mg = ws_to_mg(ws);
|
|
|
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- __queue_quiesced_migration(mg);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+ if (mg->overwrite_bio) {
|
|
|
+ /*
|
|
|
+ * It's safe to do this here, even though it's new data
|
|
|
+ * because all IO has been locked out of the block.
|
|
|
+ *
|
|
|
+ * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
|
|
|
+ * so _not_ using mg_upgrade_lock() as the continuation.
|
|
|
+ */
|
|
|
+ overwrite(mg, mg_update_metadata_after_copy);
|
|
|
|
|
|
- wake_worker(cache);
|
|
|
-}
|
|
|
+ } else {
|
|
|
+ struct cache *cache = mg->cache;
|
|
|
+ struct policy_work *op = mg->op;
|
|
|
+ bool is_policy_promote = (op->op == POLICY_PROMOTE);
|
|
|
|
|
|
-static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
|
|
|
-{
|
|
|
- unsigned long flags;
|
|
|
- struct dm_cache_migration *mg, *tmp;
|
|
|
+ if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
|
|
|
+ is_discarded_oblock(cache, op->oblock)) {
|
|
|
+ mg_upgrade_lock(ws);
|
|
|
+ return;
|
|
|
+ }
|
|
|
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- list_for_each_entry_safe(mg, tmp, work, list)
|
|
|
- __queue_quiesced_migration(mg);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+ init_continuation(&mg->k, mg_upgrade_lock);
|
|
|
|
|
|
- wake_worker(cache);
|
|
|
+ r = copy(mg, is_policy_promote);
|
|
|
+ if (r) {
|
|
|
+ DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
|
|
|
+ mg->k.input = -EIO;
|
|
|
+ mg_complete(mg, false);
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
-static void check_for_quiesced_migrations(struct cache *cache,
|
|
|
- struct per_bio_data *pb)
|
|
|
+static int mg_lock_writes(struct dm_cache_migration *mg)
|
|
|
{
|
|
|
- struct list_head work;
|
|
|
-
|
|
|
- if (!pb->all_io_entry)
|
|
|
- return;
|
|
|
-
|
|
|
- INIT_LIST_HEAD(&work);
|
|
|
- dm_deferred_entry_dec(pb->all_io_entry, &work);
|
|
|
+ int r;
|
|
|
+ struct dm_cell_key_v2 key;
|
|
|
+ struct cache *cache = mg->cache;
|
|
|
+ struct dm_bio_prison_cell_v2 *prealloc;
|
|
|
|
|
|
- if (!list_empty(&work))
|
|
|
- queue_quiesced_migrations(cache, &work);
|
|
|
-}
|
|
|
+ prealloc = alloc_prison_cell(cache);
|
|
|
+ if (!prealloc) {
|
|
|
+ DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
|
|
|
+ mg_complete(mg, false);
|
|
|
+ return -ENOMEM;
|
|
|
+ }
|
|
|
|
|
|
-static void quiesce_migration(struct dm_cache_migration *mg)
|
|
|
-{
|
|
|
- if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
|
|
|
- queue_quiesced_migration(mg);
|
|
|
-}
|
|
|
+ /*
|
|
|
+ * Prevent writes to the block, but allow reads to continue.
|
|
|
+ * Unless we're using an overwrite bio, in which case we lock
|
|
|
+ * everything.
|
|
|
+ */
|
|
|
+ build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
|
|
|
+ r = dm_cell_lock_v2(cache->prison, &key,
|
|
|
+ mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
|
|
|
+ prealloc, &mg->cell);
|
|
|
+ if (r < 0) {
|
|
|
+ free_prison_cell(cache, prealloc);
|
|
|
+ mg_complete(mg, false);
|
|
|
+ return r;
|
|
|
+ }
|
|
|
|
|
|
-static void promote(struct cache *cache, struct prealloc *structs,
|
|
|
- dm_oblock_t oblock, dm_cblock_t cblock,
|
|
|
- struct dm_bio_prison_cell *cell)
|
|
|
-{
|
|
|
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
|
|
|
+ if (mg->cell != prealloc)
|
|
|
+ free_prison_cell(cache, prealloc);
|
|
|
|
|
|
- mg->err = false;
|
|
|
- mg->discard = false;
|
|
|
- mg->writeback = false;
|
|
|
- mg->demote = false;
|
|
|
- mg->promote = true;
|
|
|
- mg->requeue_holder = true;
|
|
|
- mg->invalidate = false;
|
|
|
- mg->cache = cache;
|
|
|
- mg->new_oblock = oblock;
|
|
|
- mg->cblock = cblock;
|
|
|
- mg->old_ocell = NULL;
|
|
|
- mg->new_ocell = cell;
|
|
|
- mg->start_jiffies = jiffies;
|
|
|
+ if (r == 0)
|
|
|
+ mg_copy(&mg->k.ws);
|
|
|
+ else
|
|
|
+ quiesce(mg, mg_copy);
|
|
|
|
|
|
- inc_io_migrations(cache);
|
|
|
- quiesce_migration(mg);
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
-static void writeback(struct cache *cache, struct prealloc *structs,
|
|
|
- dm_oblock_t oblock, dm_cblock_t cblock,
|
|
|
- struct dm_bio_prison_cell *cell)
|
|
|
+static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
|
|
|
{
|
|
|
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
|
|
|
-
|
|
|
- mg->err = false;
|
|
|
- mg->discard = false;
|
|
|
- mg->writeback = true;
|
|
|
- mg->demote = false;
|
|
|
- mg->promote = false;
|
|
|
- mg->requeue_holder = true;
|
|
|
- mg->invalidate = false;
|
|
|
- mg->cache = cache;
|
|
|
- mg->old_oblock = oblock;
|
|
|
- mg->cblock = cblock;
|
|
|
- mg->old_ocell = cell;
|
|
|
- mg->new_ocell = NULL;
|
|
|
- mg->start_jiffies = jiffies;
|
|
|
-
|
|
|
- inc_io_migrations(cache);
|
|
|
- quiesce_migration(mg);
|
|
|
-}
|
|
|
-
|
|
|
-static void demote_then_promote(struct cache *cache, struct prealloc *structs,
|
|
|
- dm_oblock_t old_oblock, dm_oblock_t new_oblock,
|
|
|
- dm_cblock_t cblock,
|
|
|
- struct dm_bio_prison_cell *old_ocell,
|
|
|
- struct dm_bio_prison_cell *new_ocell)
|
|
|
-{
|
|
|
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
|
|
|
-
|
|
|
- mg->err = false;
|
|
|
- mg->discard = false;
|
|
|
- mg->writeback = false;
|
|
|
- mg->demote = true;
|
|
|
- mg->promote = true;
|
|
|
- mg->requeue_holder = true;
|
|
|
- mg->invalidate = false;
|
|
|
- mg->cache = cache;
|
|
|
- mg->old_oblock = old_oblock;
|
|
|
- mg->new_oblock = new_oblock;
|
|
|
- mg->cblock = cblock;
|
|
|
- mg->old_ocell = old_ocell;
|
|
|
- mg->new_ocell = new_ocell;
|
|
|
- mg->start_jiffies = jiffies;
|
|
|
-
|
|
|
- inc_io_migrations(cache);
|
|
|
- quiesce_migration(mg);
|
|
|
-}
|
|
|
+ struct dm_cache_migration *mg;
|
|
|
|
|
|
-/*
|
|
|
- * Invalidate a cache entry. No writeback occurs; any changes in the cache
|
|
|
- * block are thrown away.
|
|
|
- */
|
|
|
-static void invalidate(struct cache *cache, struct prealloc *structs,
|
|
|
- dm_oblock_t oblock, dm_cblock_t cblock,
|
|
|
- struct dm_bio_prison_cell *cell)
|
|
|
-{
|
|
|
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
|
|
|
-
|
|
|
- mg->err = false;
|
|
|
- mg->discard = false;
|
|
|
- mg->writeback = false;
|
|
|
- mg->demote = true;
|
|
|
- mg->promote = false;
|
|
|
- mg->requeue_holder = true;
|
|
|
- mg->invalidate = true;
|
|
|
- mg->cache = cache;
|
|
|
- mg->old_oblock = oblock;
|
|
|
- mg->cblock = cblock;
|
|
|
- mg->old_ocell = cell;
|
|
|
- mg->new_ocell = NULL;
|
|
|
- mg->start_jiffies = jiffies;
|
|
|
+ if (!background_work_begin(cache)) {
|
|
|
+ policy_complete_background_work(cache->policy, op, false);
|
|
|
+ return -EPERM;
|
|
|
+ }
|
|
|
|
|
|
- inc_io_migrations(cache);
|
|
|
- quiesce_migration(mg);
|
|
|
-}
|
|
|
+ mg = alloc_migration(cache);
|
|
|
+ if (!mg) {
|
|
|
+ policy_complete_background_work(cache->policy, op, false);
|
|
|
+ background_work_end(cache);
|
|
|
+ return -ENOMEM;
|
|
|
+ }
|
|
|
|
|
|
-static void discard(struct cache *cache, struct prealloc *structs,
|
|
|
- struct dm_bio_prison_cell *cell)
|
|
|
-{
|
|
|
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
|
|
|
+ memset(mg, 0, sizeof(*mg));
|
|
|
|
|
|
- mg->err = false;
|
|
|
- mg->discard = true;
|
|
|
- mg->writeback = false;
|
|
|
- mg->demote = false;
|
|
|
- mg->promote = false;
|
|
|
- mg->requeue_holder = false;
|
|
|
- mg->invalidate = false;
|
|
|
mg->cache = cache;
|
|
|
- mg->old_ocell = NULL;
|
|
|
- mg->new_ocell = cell;
|
|
|
- mg->start_jiffies = jiffies;
|
|
|
+ mg->op = op;
|
|
|
+ mg->overwrite_bio = bio;
|
|
|
+
|
|
|
+ if (!bio)
|
|
|
+ inc_io_migrations(cache);
|
|
|
|
|
|
- quiesce_migration(mg);
|
|
|
+ return mg_lock_writes(mg);
|
|
|
}
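
/*
 * Editorial note on the accounting above: mg_start() increments
 * nr_io_migrations only when no overwrite bio is supplied, so
 * overwrite-driven promotions are not counted against migration_threshold
 * by spare_migration_bandwidth() later in this file.
 */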
|
|
|
|
|
|
/*----------------------------------------------------------------
|
|
|
- * bio processing
|
|
|
+ * invalidation processing
|
|
|
*--------------------------------------------------------------*/
|
|
|
-static void defer_bio(struct cache *cache, struct bio *bio)
|
|
|
-{
|
|
|
- unsigned long flags;
|
|
|
-
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- bio_list_add(&cache->deferred_bios, bio);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
-
|
|
|
- wake_worker(cache);
|
|
|
-}
|
|
|
-
|
|
|
-static void process_flush_bio(struct cache *cache, struct bio *bio)
|
|
|
-{
|
|
|
- size_t pb_data_size = get_per_bio_data_size(cache);
|
|
|
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
|
|
|
-
|
|
|
- BUG_ON(bio->bi_iter.bi_size);
|
|
|
- if (!pb->req_nr)
|
|
|
- remap_to_origin(cache, bio);
|
|
|
- else
|
|
|
- remap_to_cache(cache, bio, 0);
|
|
|
|
|
|
- /*
|
|
|
- * REQ_PREFLUSH is not directed at any particular block so we don't
|
|
|
- * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH
|
|
|
- * by dm-core.
|
|
|
- */
|
|
|
- issue(cache, bio);
|
|
|
-}
|
|
|
-
|
|
|
-static void process_discard_bio(struct cache *cache, struct prealloc *structs,
|
|
|
- struct bio *bio)
|
|
|
+static void invalidate_complete(struct dm_cache_migration *mg, bool success)
|
|
|
{
|
|
|
- int r;
|
|
|
- dm_dblock_t b, e;
|
|
|
- struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
|
|
|
-
|
|
|
- calc_discard_block_range(cache, bio, &b, &e);
|
|
|
- if (b == e) {
|
|
|
- bio_endio(bio);
|
|
|
- return;
|
|
|
- }
|
|
|
+ struct bio_list bios;
|
|
|
+ struct cache *cache = mg->cache;
|
|
|
|
|
|
- cell_prealloc = prealloc_get_cell(structs);
|
|
|
- r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
|
|
|
- (cell_free_fn) prealloc_put_cell,
|
|
|
- structs, &new_ocell);
|
|
|
- if (r > 0)
|
|
|
- return;
|
|
|
+ bio_list_init(&bios);
|
|
|
+ if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
|
|
|
+ free_prison_cell(cache, mg->cell);
|
|
|
|
|
|
- discard(cache, structs, new_ocell);
|
|
|
-}
|
|
|
+ if (!success && mg->overwrite_bio)
|
|
|
+ bio_io_error(mg->overwrite_bio);
|
|
|
|
|
|
-static bool spare_migration_bandwidth(struct cache *cache)
|
|
|
-{
|
|
|
- sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
|
|
|
- cache->sectors_per_block;
|
|
|
- return current_volume < cache->migration_threshold;
|
|
|
-}
|
|
|
+ free_migration(mg);
|
|
|
+ defer_bios(cache, &bios);
|
|
|
|
|
|
-static void inc_hit_counter(struct cache *cache, struct bio *bio)
|
|
|
-{
|
|
|
- atomic_inc(bio_data_dir(bio) == READ ?
|
|
|
- &cache->stats.read_hit : &cache->stats.write_hit);
|
|
|
+ background_work_end(cache);
|
|
|
}
|
|
|
|
|
|
-static void inc_miss_counter(struct cache *cache, struct bio *bio)
|
|
|
+static void invalidate_completed(struct work_struct *ws)
|
|
|
{
|
|
|
- atomic_inc(bio_data_dir(bio) == READ ?
|
|
|
- &cache->stats.read_miss : &cache->stats.write_miss);
|
|
|
+ struct dm_cache_migration *mg = ws_to_mg(ws);
|
|
|
+ invalidate_complete(mg, !mg->k.input);
|
|
|
}
|
|
|
|
|
|
-/*----------------------------------------------------------------*/
|
|
|
-
|
|
|
-struct inc_detail {
|
|
|
- struct cache *cache;
|
|
|
- struct bio_list bios_for_issue;
|
|
|
- struct bio_list unhandled_bios;
|
|
|
- bool any_writes;
|
|
|
-};
|
|
|
-
|
|
|
-static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
|
|
|
+static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
|
|
|
{
|
|
|
- struct bio *bio;
|
|
|
- struct inc_detail *detail = context;
|
|
|
- struct cache *cache = detail->cache;
|
|
|
-
|
|
|
- inc_ds(cache, cell->holder, cell);
|
|
|
- if (bio_data_dir(cell->holder) == WRITE)
|
|
|
- detail->any_writes = true;
|
|
|
-
|
|
|
- while ((bio = bio_list_pop(&cell->bios))) {
|
|
|
- if (discard_or_flush(bio)) {
|
|
|
- bio_list_add(&detail->unhandled_bios, bio);
|
|
|
- continue;
|
|
|
+ int r = policy_invalidate_mapping(cache->policy, cblock);
|
|
|
+ if (!r) {
|
|
|
+ r = dm_cache_remove_mapping(cache->cmd, cblock);
|
|
|
+ if (r) {
|
|
|
+ DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
|
|
|
+ cache_device_name(cache));
|
|
|
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
|
|
|
}
|
|
|
|
|
|
- if (bio_data_dir(bio) == WRITE)
|
|
|
- detail->any_writes = true;
|
|
|
+ } else if (r == -ENODATA) {
|
|
|
+ /*
|
|
|
+ * Harmless, already unmapped.
|
|
|
+ */
|
|
|
+ r = 0;
|
|
|
|
|
|
- bio_list_add(&detail->bios_for_issue, bio);
|
|
|
- inc_ds(cache, bio, cell);
|
|
|
- }
|
|
|
+ } else
|
|
|
+ DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
|
|
|
+
|
|
|
+ return r;
|
|
|
}
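
/*
 * Editorial note: because -ENODATA is folded into success above, both
 * callers (invalidate_remove() below and request_invalidation() further
 * down) may ask to invalidate a cblock that is already unmapped, which
 * makes the operation effectively idempotent.
 */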
|
|
|
|
|
|
-// FIXME: refactor these two
|
|
|
-static void remap_cell_to_origin_clear_discard(struct cache *cache,
|
|
|
- struct dm_bio_prison_cell *cell,
|
|
|
- dm_oblock_t oblock, bool issue_holder)
|
|
|
+static void invalidate_remove(struct work_struct *ws)
|
|
|
{
|
|
|
- struct bio *bio;
|
|
|
- unsigned long flags;
|
|
|
- struct inc_detail detail;
|
|
|
-
|
|
|
- detail.cache = cache;
|
|
|
- bio_list_init(&detail.bios_for_issue);
|
|
|
- bio_list_init(&detail.unhandled_bios);
|
|
|
- detail.any_writes = false;
|
|
|
-
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
|
|
|
- bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
-
|
|
|
- remap_to_origin(cache, cell->holder);
|
|
|
- if (issue_holder)
|
|
|
- issue(cache, cell->holder);
|
|
|
- else
|
|
|
- accounted_begin(cache, cell->holder);
|
|
|
-
|
|
|
- if (detail.any_writes)
|
|
|
- clear_discard(cache, oblock_to_dblock(cache, oblock));
|
|
|
+ int r;
|
|
|
+ struct dm_cache_migration *mg = ws_to_mg(ws);
|
|
|
+ struct cache *cache = mg->cache;
|
|
|
|
|
|
- while ((bio = bio_list_pop(&detail.bios_for_issue))) {
|
|
|
- remap_to_origin(cache, bio);
|
|
|
- issue(cache, bio);
|
|
|
+ r = invalidate_cblock(cache, mg->invalidate_cblock);
|
|
|
+ if (r) {
|
|
|
+ invalidate_complete(mg, false);
|
|
|
+ return;
|
|
|
}
|
|
|
|
|
|
- free_prison_cell(cache, cell);
|
|
|
+ init_continuation(&mg->k, invalidate_completed);
|
|
|
+ continue_after_commit(&cache->committer, &mg->k);
|
|
|
+ remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
|
|
|
+ mg->overwrite_bio = NULL;
|
|
|
+ schedule_commit(&cache->committer);
|
|
|
}
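
/*
 * Editorial note on the continuation idiom used above: invalidate_remove()
 * points mg->k at invalidate_completed() and registers it with the
 * committer via continue_after_commit().  Once the commit lands the work
 * item fires, ws_to_mg() recovers the migration from the work_struct, and
 * k.input carries any error into invalidate_complete().
 */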
|
|
|
|
|
|
-static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
|
|
|
- dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
|
|
|
+static int invalidate_lock(struct dm_cache_migration *mg)
|
|
|
{
|
|
|
- struct bio *bio;
|
|
|
- unsigned long flags;
|
|
|
- struct inc_detail detail;
|
|
|
-
|
|
|
- detail.cache = cache;
|
|
|
- bio_list_init(&detail.bios_for_issue);
|
|
|
- bio_list_init(&detail.unhandled_bios);
|
|
|
- detail.any_writes = false;
|
|
|
-
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
|
|
|
- bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
-
|
|
|
- remap_to_cache(cache, cell->holder, cblock);
|
|
|
- if (issue_holder)
|
|
|
- issue(cache, cell->holder);
|
|
|
- else
|
|
|
- accounted_begin(cache, cell->holder);
|
|
|
+ int r;
|
|
|
+ struct dm_cell_key_v2 key;
|
|
|
+ struct cache *cache = mg->cache;
|
|
|
+ struct dm_bio_prison_cell_v2 *prealloc;
|
|
|
|
|
|
- if (detail.any_writes) {
|
|
|
- set_dirty(cache, oblock, cblock);
|
|
|
- clear_discard(cache, oblock_to_dblock(cache, oblock));
|
|
|
+ prealloc = alloc_prison_cell(cache);
|
|
|
+ if (!prealloc) {
|
|
|
+ invalidate_complete(mg, false);
|
|
|
+ return -ENOMEM;
|
|
|
}
|
|
|
|
|
|
- while ((bio = bio_list_pop(&detail.bios_for_issue))) {
|
|
|
- remap_to_cache(cache, bio, cblock);
|
|
|
- issue(cache, bio);
|
|
|
+ build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
|
|
|
+ r = dm_cell_lock_v2(cache->prison, &key,
|
|
|
+ READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
|
|
|
+ if (r < 0) {
|
|
|
+ free_prison_cell(cache, prealloc);
|
|
|
+ invalidate_complete(mg, false);
|
|
|
+ return r;
|
|
|
}
|
|
|
|
|
|
- free_prison_cell(cache, cell);
|
|
|
-}
|
|
|
+ if (mg->cell != prealloc)
|
|
|
+ free_prison_cell(cache, prealloc);
|
|
|
|
|
|
-/*----------------------------------------------------------------*/
|
|
|
+ if (r)
|
|
|
+ quiesce(mg, invalidate_remove);
|
|
|
|
|
|
-struct old_oblock_lock {
|
|
|
- struct policy_locker locker;
|
|
|
- struct cache *cache;
|
|
|
- struct prealloc *structs;
|
|
|
- struct dm_bio_prison_cell *cell;
|
|
|
-};
|
|
|
+ else {
|
|
|
+ /*
|
|
|
+ * We can't call invalidate_remove() directly here because we
|
|
|
+ * might still be in request context.
|
|
|
+ */
|
|
|
+ init_continuation(&mg->k, invalidate_remove);
|
|
|
+ queue_work(cache->wq, &mg->k.ws);
|
|
|
+ }
|
|
|
|
|
|
-static int null_locker(struct policy_locker *locker, dm_oblock_t b)
|
|
|
-{
|
|
|
- /* This should never be called */
|
|
|
- BUG();
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
|
|
|
+static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
|
|
|
+ dm_oblock_t oblock, struct bio *bio)
|
|
|
{
|
|
|
- struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
|
|
|
- struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
|
|
|
-
|
|
|
- return bio_detain(l->cache, b, NULL, cell_prealloc,
|
|
|
- (cell_free_fn) prealloc_put_cell,
|
|
|
- l->structs, &l->cell);
|
|
|
-}
|
|
|
-
|
|
|
-static void process_cell(struct cache *cache, struct prealloc *structs,
|
|
|
- struct dm_bio_prison_cell *new_ocell)
|
|
|
-{
|
|
|
- int r;
|
|
|
- bool release_cell = true;
|
|
|
- struct bio *bio = new_ocell->holder;
|
|
|
- dm_oblock_t block = get_bio_block(cache, bio);
|
|
|
- struct policy_result lookup_result;
|
|
|
- bool passthrough = passthrough_mode(&cache->features);
|
|
|
- bool fast_promotion, can_migrate;
|
|
|
- struct old_oblock_lock ool;
|
|
|
-
|
|
|
- fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
|
|
|
- can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
|
|
|
-
|
|
|
- ool.locker.fn = cell_locker;
|
|
|
- ool.cache = cache;
|
|
|
- ool.structs = structs;
|
|
|
- ool.cell = NULL;
|
|
|
- r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
|
|
|
- bio, &ool.locker, &lookup_result);
|
|
|
-
|
|
|
- if (r == -EWOULDBLOCK)
|
|
|
- /* migration has been denied */
|
|
|
- lookup_result.op = POLICY_MISS;
|
|
|
-
|
|
|
- switch (lookup_result.op) {
|
|
|
- case POLICY_HIT:
|
|
|
- if (passthrough) {
|
|
|
- inc_miss_counter(cache, bio);
|
|
|
-
|
|
|
- /*
|
|
|
- * Passthrough always maps to the origin,
|
|
|
- * invalidating any cache blocks that are written
|
|
|
- * to.
|
|
|
- */
|
|
|
-
|
|
|
- if (bio_data_dir(bio) == WRITE) {
|
|
|
- atomic_inc(&cache->stats.demotion);
|
|
|
- invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
|
|
|
- release_cell = false;
|
|
|
+ struct dm_cache_migration *mg;
|
|
|
|
|
|
- } else {
|
|
|
- /* FIXME: factor out issue_origin() */
|
|
|
- remap_to_origin_clear_discard(cache, bio, block);
|
|
|
- inc_and_issue(cache, bio, new_ocell);
|
|
|
- }
|
|
|
- } else {
|
|
|
- inc_hit_counter(cache, bio);
|
|
|
-
|
|
|
- if (bio_data_dir(bio) == WRITE &&
|
|
|
- writethrough_mode(&cache->features) &&
|
|
|
- !is_dirty(cache, lookup_result.cblock)) {
|
|
|
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
|
|
|
- inc_and_issue(cache, bio, new_ocell);
|
|
|
-
|
|
|
- } else {
|
|
|
- remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
|
|
|
- release_cell = false;
|
|
|
- }
|
|
|
- }
|
|
|
+ if (!background_work_begin(cache))
|
|
|
+ return -EPERM;
|
|
|
|
|
|
- break;
|
|
|
+ mg = alloc_migration(cache);
|
|
|
+ if (!mg) {
|
|
|
+ background_work_end(cache);
|
|
|
+ return -ENOMEM;
|
|
|
+ }
|
|
|
|
|
|
- case POLICY_MISS:
|
|
|
- inc_miss_counter(cache, bio);
|
|
|
- remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
|
|
|
- release_cell = false;
|
|
|
- break;
|
|
|
+ memset(mg, 0, sizeof(*mg));
|
|
|
|
|
|
- case POLICY_NEW:
|
|
|
- atomic_inc(&cache->stats.promotion);
|
|
|
- promote(cache, structs, block, lookup_result.cblock, new_ocell);
|
|
|
- release_cell = false;
|
|
|
- break;
|
|
|
+ mg->cache = cache;
|
|
|
+ mg->overwrite_bio = bio;
|
|
|
+ mg->invalidate_cblock = cblock;
|
|
|
+ mg->invalidate_oblock = oblock;
|
|
|
|
|
|
- case POLICY_REPLACE:
|
|
|
- atomic_inc(&cache->stats.demotion);
|
|
|
- atomic_inc(&cache->stats.promotion);
|
|
|
- demote_then_promote(cache, structs, lookup_result.old_oblock,
|
|
|
- block, lookup_result.cblock,
|
|
|
- ool.cell, new_ocell);
|
|
|
- release_cell = false;
|
|
|
- break;
|
|
|
+ return invalidate_lock(mg);
|
|
|
+}
|
|
|
|
|
|
- default:
|
|
|
- DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
|
|
|
- cache_device_name(cache), __func__,
|
|
|
- (unsigned) lookup_result.op);
|
|
|
- bio_io_error(bio);
|
|
|
- }
|
|
|
+/*----------------------------------------------------------------
|
|
|
+ * bio processing
|
|
|
+ *--------------------------------------------------------------*/
|
|
|
|
|
|
- if (release_cell)
|
|
|
- cell_defer(cache, new_ocell, false);
|
|
|
-}
|
|
|
+enum busy {
|
|
|
+ IDLE,
|
|
|
+ MODERATE,
|
|
|
+ BUSY
|
|
|
+};
|
|
|
|
|
|
-static void process_bio(struct cache *cache, struct prealloc *structs,
|
|
|
- struct bio *bio)
|
|
|
+static enum busy spare_migration_bandwidth(struct cache *cache)
|
|
|
{
|
|
|
- int r;
|
|
|
- dm_oblock_t block = get_bio_block(cache, bio);
|
|
|
- struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
|
|
|
-
|
|
|
- /*
|
|
|
- * Check to see if that block is currently migrating.
|
|
|
- */
|
|
|
- cell_prealloc = prealloc_get_cell(structs);
|
|
|
- r = bio_detain(cache, block, bio, cell_prealloc,
|
|
|
- (cell_free_fn) prealloc_put_cell,
|
|
|
- structs, &new_ocell);
|
|
|
- if (r > 0)
|
|
|
- return;
|
|
|
+ bool idle = iot_idle_for(&cache->origin_tracker, HZ);
|
|
|
+ sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
|
|
|
+ cache->sectors_per_block;
|
|
|
|
|
|
- process_cell(cache, structs, new_ocell);
|
|
|
+ if (current_volume <= cache->migration_threshold)
|
|
|
+ return idle ? IDLE : MODERATE;
|
|
|
+ else
|
|
|
+ return idle ? MODERATE : BUSY;
|
|
|
}
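
/*
 * A worked example of the classification above, assuming 512-sector
 * (256KiB) cache blocks and the default migration_threshold of 2048
 * sectors: with three migrations in flight, current_volume is
 * (3 + 1) * 512 = 2048 <= 2048, giving IDLE or MODERATE depending on
 * whether the origin has been idle for the last second; a fourth
 * concurrent migration gives (4 + 1) * 512 = 2560 > 2048 and the result
 * degrades to MODERATE or BUSY.
 */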
|
|
|
|
|
|
-static int need_commit_due_to_time(struct cache *cache)
|
|
|
+static void inc_hit_counter(struct cache *cache, struct bio *bio)
|
|
|
{
|
|
|
- return jiffies < cache->last_commit_jiffies ||
|
|
|
- jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
|
|
|
+ atomic_inc(bio_data_dir(bio) == READ ?
|
|
|
+ &cache->stats.read_hit : &cache->stats.write_hit);
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * A non-zero return indicates read_only or fail_io mode.
|
|
|
- */
|
|
|
-static int commit(struct cache *cache, bool clean_shutdown)
|
|
|
+static void inc_miss_counter(struct cache *cache, struct bio *bio)
|
|
|
{
|
|
|
- int r;
|
|
|
-
|
|
|
- if (get_cache_mode(cache) >= CM_READ_ONLY)
|
|
|
- return -EINVAL;
|
|
|
+ atomic_inc(bio_data_dir(bio) == READ ?
|
|
|
+ &cache->stats.read_miss : &cache->stats.write_miss);
|
|
|
+}
|
|
|
|
|
|
- atomic_inc(&cache->stats.commit_count);
|
|
|
- r = dm_cache_commit(cache->cmd, clean_shutdown);
|
|
|
- if (r)
|
|
|
- metadata_operation_failed(cache, "dm_cache_commit", r);
|
|
|
+/*----------------------------------------------------------------*/
|
|
|
|
|
|
- return r;
|
|
|
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
|
|
|
+{
|
|
|
+ return (bio_data_dir(bio) == WRITE) &&
|
|
|
+ (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
|
|
|
}
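
/*
 * Worked example: SECTOR_SHIFT is 9 (512-byte sectors), so with a cache
 * configured for 128-sector (64KiB) blocks this helper matches only a
 * WRITE bio of exactly 128 << 9 == 65536 bytes, i.e. one that overwrites
 * the whole block.
 */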
|
|
|
|
|
|
-static int commit_if_needed(struct cache *cache)
|
|
|
+static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
|
|
|
{
|
|
|
- int r = 0;
|
|
|
-
|
|
|
- if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
|
|
|
- dm_cache_changed_this_transaction(cache->cmd)) {
|
|
|
- r = commit(cache, false);
|
|
|
- cache->commit_requested = false;
|
|
|
- cache->last_commit_jiffies = jiffies;
|
|
|
- }
|
|
|
-
|
|
|
- return r;
|
|
|
+ return writeback_mode(&cache->features) &&
|
|
|
+ (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
|
|
|
}
|
|
|
|
|
|
-static void process_deferred_bios(struct cache *cache)
|
|
|
+static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
|
|
|
+ bool *commit_needed)
|
|
|
{
|
|
|
- bool prealloc_used = false;
|
|
|
- unsigned long flags;
|
|
|
- struct bio_list bios;
|
|
|
- struct bio *bio;
|
|
|
- struct prealloc structs;
|
|
|
-
|
|
|
- memset(&structs, 0, sizeof(structs));
|
|
|
- bio_list_init(&bios);
|
|
|
+ int r, data_dir;
|
|
|
+ bool rb, background_queued;
|
|
|
+ dm_cblock_t cblock;
|
|
|
+ size_t pb_data_size = get_per_bio_data_size(cache);
|
|
|
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
|
|
|
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- bio_list_merge(&bios, &cache->deferred_bios);
|
|
|
- bio_list_init(&cache->deferred_bios);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+ *commit_needed = false;
|
|
|
|
|
|
- while (!bio_list_empty(&bios)) {
|
|
|
+ rb = bio_detain_shared(cache, block, bio);
|
|
|
+ if (!rb) {
|
|
|
/*
|
|
|
- * If we've got no free migration structs, and processing
|
|
|
- * this bio might require one, we pause until there are some
|
|
|
- * prepared mappings to process.
|
|
|
+ * An exclusive lock is held for this block, so we have to
|
|
|
+ * wait. We set the commit_needed flag so the current
|
|
|
+ * transaction will be committed asap, allowing this lock
|
|
|
+ * to be dropped.
|
|
|
*/
|
|
|
- prealloc_used = true;
|
|
|
- if (prealloc_data_structs(cache, &structs)) {
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- bio_list_merge(&cache->deferred_bios, &bios);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
- break;
|
|
|
+ *commit_needed = true;
|
|
|
+ return DM_MAPIO_SUBMITTED;
|
|
|
+ }
|
|
|
+
|
|
|
+ data_dir = bio_data_dir(bio);
|
|
|
+
|
|
|
+ if (optimisable_bio(cache, bio, block)) {
|
|
|
+ struct policy_work *op = NULL;
|
|
|
+
|
|
|
+ r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
|
|
|
+ if (unlikely(r && r != -ENOENT)) {
|
|
|
+ DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
|
|
|
+ cache_device_name(cache), r);
|
|
|
+ bio_io_error(bio);
|
|
|
+ return DM_MAPIO_SUBMITTED;
|
|
|
}
|
|
|
|
|
|
- bio = bio_list_pop(&bios);
|
|
|
+ if (r == -ENOENT && op) {
|
|
|
+ bio_drop_shared_lock(cache, bio);
|
|
|
+ BUG_ON(op->op != POLICY_PROMOTE);
|
|
|
+ mg_start(cache, op, bio);
|
|
|
+ return DM_MAPIO_SUBMITTED;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
|
|
|
+ if (unlikely(r && r != -ENOENT)) {
|
|
|
+ DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
|
|
|
+ cache_device_name(cache), r);
|
|
|
+ bio_io_error(bio);
|
|
|
+ return DM_MAPIO_SUBMITTED;
|
|
|
+ }
|
|
|
|
|
|
- if (bio->bi_opf & REQ_PREFLUSH)
|
|
|
- process_flush_bio(cache, bio);
|
|
|
- else if (bio_op(bio) == REQ_OP_DISCARD)
|
|
|
- process_discard_bio(cache, &structs, bio);
|
|
|
- else
|
|
|
- process_bio(cache, &structs, bio);
|
|
|
+ if (background_queued)
|
|
|
+ wake_migration_worker(cache);
|
|
|
}
|
|
|
|
|
|
- if (prealloc_used)
|
|
|
- prealloc_free_structs(cache, &structs);
|
|
|
-}
|
|
|
-
|
|
|
-static void process_deferred_cells(struct cache *cache)
|
|
|
-{
|
|
|
- bool prealloc_used = false;
|
|
|
- unsigned long flags;
|
|
|
- struct dm_bio_prison_cell *cell, *tmp;
|
|
|
- struct list_head cells;
|
|
|
- struct prealloc structs;
|
|
|
+ if (r == -ENOENT) {
|
|
|
+ /*
|
|
|
+ * Miss.
|
|
|
+ */
|
|
|
+ inc_miss_counter(cache, bio);
|
|
|
+ if (pb->req_nr == 0) {
|
|
|
+ accounted_begin(cache, bio);
|
|
|
+ remap_to_origin_clear_discard(cache, bio, block);
|
|
|
|
|
|
- memset(&structs, 0, sizeof(structs));
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ * This is a duplicate writethrough io that is no
|
|
|
+ * longer needed because the block has been demoted.
|
|
|
+ */
|
|
|
+ bio_endio(bio);
|
|
|
+ return DM_MAPIO_SUBMITTED;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ * Hit.
|
|
|
+ */
|
|
|
+ inc_hit_counter(cache, bio);
|
|
|
|
|
|
- INIT_LIST_HEAD(&cells);
|
|
|
+ /*
|
|
|
+ * Passthrough always maps to the origin, invalidating any
|
|
|
+ * cache blocks that are written to.
|
|
|
+ */
|
|
|
+ if (passthrough_mode(&cache->features)) {
|
|
|
+ if (bio_data_dir(bio) == WRITE) {
|
|
|
+ bio_drop_shared_lock(cache, bio);
|
|
|
+ atomic_inc(&cache->stats.demotion);
|
|
|
+ invalidate_start(cache, cblock, block, bio);
|
|
|
+ } else
|
|
|
+ remap_to_origin_clear_discard(cache, bio, block);
|
|
|
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- list_splice_init(&cache->deferred_cells, &cells);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+ } else {
|
|
|
+ if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
|
|
|
+ !is_dirty(cache, cblock)) {
|
|
|
+ remap_to_origin_then_cache(cache, bio, block, cblock);
|
|
|
+ accounted_begin(cache, bio);
|
|
|
+ } else
|
|
|
+ remap_to_cache_dirty(cache, bio, block, cblock);
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
- list_for_each_entry_safe(cell, tmp, &cells, user_list) {
|
|
|
+ /*
|
|
|
+ * dm core turns FUA requests into a separate payload and FLUSH req.
|
|
|
+ */
|
|
|
+ if (bio->bi_opf & REQ_FUA) {
|
|
|
/*
|
|
|
- * If we've got no free migration structs, and processing
|
|
|
- * this bio might require one, we pause until there are some
|
|
|
- * prepared mappings to process.
|
|
|
+ * issue_after_commit will call accounted_begin a second time. So
|
|
|
+ * we call accounted_complete() to avoid double accounting.
|
|
|
*/
|
|
|
- prealloc_used = true;
|
|
|
- if (prealloc_data_structs(cache, &structs)) {
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- list_splice(&cells, &cache->deferred_cells);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
- break;
|
|
|
- }
|
|
|
-
|
|
|
- process_cell(cache, &structs, cell);
|
|
|
+ accounted_complete(cache, bio);
|
|
|
+ issue_after_commit(&cache->committer, bio);
|
|
|
+ *commit_needed = true;
|
|
|
+ return DM_MAPIO_SUBMITTED;
|
|
|
}
|
|
|
|
|
|
- if (prealloc_used)
|
|
|
- prealloc_free_structs(cache, &structs);
|
|
|
+ return DM_MAPIO_REMAPPED;
|
|
|
}
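
/*
 * Editorial note on how the commit_needed out-parameter is consumed:
 * cache_map() schedules a commit immediately when it is set, while the
 * other caller, process_bio(), returns the flag to process_deferred_bios(),
 * which ORs the results for a whole batch of deferred bios and schedules a
 * single commit at the end.
 */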
|
|
|
|
|
|
-static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
|
|
|
+static bool process_bio(struct cache *cache, struct bio *bio)
|
|
|
{
|
|
|
- unsigned long flags;
|
|
|
- struct bio_list bios;
|
|
|
- struct bio *bio;
|
|
|
-
|
|
|
- bio_list_init(&bios);
|
|
|
+ bool commit_needed;
|
|
|
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- bio_list_merge(&bios, &cache->deferred_flush_bios);
|
|
|
- bio_list_init(&cache->deferred_flush_bios);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+ if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
|
|
|
+ generic_make_request(bio);
|
|
|
|
|
|
- /*
|
|
|
- * These bios have already been through inc_ds()
|
|
|
- */
|
|
|
- while ((bio = bio_list_pop(&bios)))
|
|
|
- submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
|
|
|
+ return commit_needed;
|
|
|
}
|
|
|
|
|
|
-static void process_deferred_writethrough_bios(struct cache *cache)
|
|
|
+/*
|
|
|
+ * A non-zero return indicates read_only or fail_io mode.
|
|
|
+ */
|
|
|
+static int commit(struct cache *cache, bool clean_shutdown)
|
|
|
{
|
|
|
- unsigned long flags;
|
|
|
- struct bio_list bios;
|
|
|
- struct bio *bio;
|
|
|
+ int r;
|
|
|
|
|
|
- bio_list_init(&bios);
|
|
|
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
|
|
|
+ return -EINVAL;
|
|
|
|
|
|
- spin_lock_irqsave(&cache->lock, flags);
|
|
|
- bio_list_merge(&bios, &cache->deferred_writethrough_bios);
|
|
|
- bio_list_init(&cache->deferred_writethrough_bios);
|
|
|
- spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
+ atomic_inc(&cache->stats.commit_count);
|
|
|
+ r = dm_cache_commit(cache->cmd, clean_shutdown);
|
|
|
+ if (r)
|
|
|
+ metadata_operation_failed(cache, "dm_cache_commit", r);
|
|
|
|
|
|
- /*
|
|
|
- * These bios have already been through inc_ds()
|
|
|
- */
|
|
|
- while ((bio = bio_list_pop(&bios)))
|
|
|
- accounted_request(cache, bio);
|
|
|
+ return r;
|
|
|
}
|
|
|
|
|
|
-static void writeback_some_dirty_blocks(struct cache *cache)
|
|
|
+/*
|
|
|
+ * Used by the batcher.
|
|
|
+ */
|
|
|
+static int commit_op(void *context)
|
|
|
{
|
|
|
- bool prealloc_used = false;
|
|
|
- dm_oblock_t oblock;
|
|
|
- dm_cblock_t cblock;
|
|
|
- struct prealloc structs;
|
|
|
- struct dm_bio_prison_cell *old_ocell;
|
|
|
- bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
|
|
|
-
|
|
|
- memset(&structs, 0, sizeof(structs));
|
|
|
-
|
|
|
- while (spare_migration_bandwidth(cache)) {
|
|
|
- if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
|
|
|
- break; /* no work to do */
|
|
|
-
|
|
|
- prealloc_used = true;
|
|
|
- if (prealloc_data_structs(cache, &structs) ||
|
|
|
- get_cell(cache, oblock, &structs, &old_ocell)) {
|
|
|
- policy_set_dirty(cache->policy, oblock);
|
|
|
- break;
|
|
|
- }
|
|
|
+ struct cache *cache = context;
|
|
|
|
|
|
- writeback(cache, &structs, oblock, cblock, old_ocell);
|
|
|
- }
|
|
|
+ if (dm_cache_changed_this_transaction(cache->cmd))
|
|
|
+ return commit(cache, false);
|
|
|
|
|
|
- if (prealloc_used)
|
|
|
- prealloc_free_structs(cache, &structs);
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
-/*----------------------------------------------------------------
|
|
|
- * Invalidations.
|
|
|
- * Dropping something from the cache *without* writing back.
|
|
|
- *--------------------------------------------------------------*/
|
|
|
+/*----------------------------------------------------------------*/
|
|
|
|
|
|
-static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
|
|
|
+static bool process_flush_bio(struct cache *cache, struct bio *bio)
|
|
|
{
|
|
|
- int r = 0;
|
|
|
- uint64_t begin = from_cblock(req->cblocks->begin);
|
|
|
- uint64_t end = from_cblock(req->cblocks->end);
|
|
|
-
|
|
|
- while (begin != end) {
|
|
|
- r = policy_remove_cblock(cache->policy, to_cblock(begin));
|
|
|
- if (!r) {
|
|
|
- r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
|
|
|
- if (r) {
|
|
|
- metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
|
|
|
- break;
|
|
|
- }
|
|
|
-
|
|
|
- } else if (r == -ENODATA) {
|
|
|
- /* harmless, already unmapped */
|
|
|
- r = 0;
|
|
|
-
|
|
|
- } else {
|
|
|
- DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
|
|
|
- break;
|
|
|
- }
|
|
|
-
|
|
|
- begin++;
|
|
|
- }
|
|
|
-
|
|
|
- cache->commit_requested = true;
|
|
|
+ size_t pb_data_size = get_per_bio_data_size(cache);
|
|
|
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
|
|
|
|
|
|
- req->err = r;
|
|
|
- atomic_set(&req->complete, 1);
|
|
|
+ if (!pb->req_nr)
|
|
|
+ remap_to_origin(cache, bio);
|
|
|
+ else
|
|
|
+ remap_to_cache(cache, bio, 0);
|
|
|
|
|
|
- wake_up(&req->result_wait);
|
|
|
+ issue_after_commit(&cache->committer, bio);
|
|
|
+ return true;
|
|
|
}
|
|
|
|
|
|
-static void process_invalidation_requests(struct cache *cache)
|
|
|
+static bool process_discard_bio(struct cache *cache, struct bio *bio)
|
|
|
{
|
|
|
- struct list_head list;
|
|
|
- struct invalidation_request *req, *tmp;
|
|
|
+ dm_dblock_t b, e;
|
|
|
|
|
|
- INIT_LIST_HEAD(&list);
|
|
|
- spin_lock(&cache->invalidation_lock);
|
|
|
- list_splice_init(&cache->invalidation_requests, &list);
|
|
|
- spin_unlock(&cache->invalidation_lock);
|
|
|
+ // FIXME: do we need to lock the region? Or can we just assume the
|
|
|
+	// user won't be so foolish as to issue discard concurrently with
|
|
|
+ // other IO?
|
|
|
+ calc_discard_block_range(cache, bio, &b, &e);
|
|
|
+ while (b != e) {
|
|
|
+ set_discard(cache, b);
|
|
|
+ b = to_dblock(from_dblock(b) + 1);
|
|
|
+ }
|
|
|
|
|
|
- list_for_each_entry_safe (req, tmp, &list, list)
|
|
|
- process_invalidation_request(cache, req);
|
|
|
-}
|
|
|
+ bio_endio(bio);
|
|
|
|
|
|
-/*----------------------------------------------------------------
|
|
|
- * Main worker loop
|
|
|
- *--------------------------------------------------------------*/
|
|
|
-static bool is_quiescing(struct cache *cache)
|
|
|
-{
|
|
|
- return atomic_read(&cache->quiescing);
|
|
|
+ return false;
|
|
|
}
|
|
|
|
|
|
-static void ack_quiescing(struct cache *cache)
|
|
|
+static void process_deferred_bios(struct work_struct *ws)
|
|
|
{
|
|
|
- if (is_quiescing(cache)) {
|
|
|
- atomic_inc(&cache->quiescing_ack);
|
|
|
- wake_up(&cache->quiescing_wait);
|
|
|
- }
|
|
|
-}
|
|
|
+ struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
|
|
|
|
|
|
-static void wait_for_quiescing_ack(struct cache *cache)
|
|
|
-{
|
|
|
- wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
|
|
|
-}
|
|
|
+ unsigned long flags;
|
|
|
+ bool commit_needed = false;
|
|
|
+ struct bio_list bios;
|
|
|
+ struct bio *bio;
|
|
|
|
|
|
-static void start_quiescing(struct cache *cache)
|
|
|
-{
|
|
|
- atomic_inc(&cache->quiescing);
|
|
|
- wait_for_quiescing_ack(cache);
|
|
|
-}
|
|
|
+ bio_list_init(&bios);
|
|
|
|
|
|
-static void stop_quiescing(struct cache *cache)
|
|
|
-{
|
|
|
- atomic_set(&cache->quiescing, 0);
|
|
|
- atomic_set(&cache->quiescing_ack, 0);
|
|
|
-}
|
|
|
+ spin_lock_irqsave(&cache->lock, flags);
|
|
|
+ bio_list_merge(&bios, &cache->deferred_bios);
|
|
|
+ bio_list_init(&cache->deferred_bios);
|
|
|
+ spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
|
|
|
-static void wait_for_migrations(struct cache *cache)
|
|
|
-{
|
|
|
- wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
|
|
|
-}
|
|
|
+ while ((bio = bio_list_pop(&bios))) {
|
|
|
+ if (bio->bi_opf & REQ_PREFLUSH)
|
|
|
+ commit_needed = process_flush_bio(cache, bio) || commit_needed;
|
|
|
|
|
|
-static void stop_worker(struct cache *cache)
|
|
|
-{
|
|
|
- cancel_delayed_work(&cache->waker);
|
|
|
- flush_workqueue(cache->wq);
|
|
|
+ else if (bio_op(bio) == REQ_OP_DISCARD)
|
|
|
+ commit_needed = process_discard_bio(cache, bio) || commit_needed;
|
|
|
+
|
|
|
+ else
|
|
|
+ commit_needed = process_bio(cache, bio) || commit_needed;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (commit_needed)
|
|
|
+ schedule_commit(&cache->committer);
|
|
|
}
|
|
|
|
|
|
-static void requeue_deferred_cells(struct cache *cache)
|
|
|
+static void process_deferred_writethrough_bios(struct work_struct *ws)
|
|
|
{
|
|
|
+ struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
|
|
|
+
|
|
|
unsigned long flags;
|
|
|
- struct list_head cells;
|
|
|
- struct dm_bio_prison_cell *cell, *tmp;
|
|
|
+ struct bio_list bios;
|
|
|
+ struct bio *bio;
|
|
|
+
|
|
|
+ bio_list_init(&bios);
|
|
|
|
|
|
- INIT_LIST_HEAD(&cells);
|
|
|
spin_lock_irqsave(&cache->lock, flags);
|
|
|
- list_splice_init(&cache->deferred_cells, &cells);
|
|
|
+ bio_list_merge(&bios, &cache->deferred_writethrough_bios);
|
|
|
+ bio_list_init(&cache->deferred_writethrough_bios);
|
|
|
spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
|
|
|
- list_for_each_entry_safe(cell, tmp, &cells, user_list)
|
|
|
- cell_requeue(cache, cell);
|
|
|
+ /*
|
|
|
+ * These bios have already been through accounted_begin()
|
|
|
+ */
|
|
|
+ while ((bio = bio_list_pop(&bios)))
|
|
|
+ generic_make_request(bio);
|
|
|
}
|
|
|
|
|
|
+/*----------------------------------------------------------------
|
|
|
+ * Main worker loop
|
|
|
+ *--------------------------------------------------------------*/
|
|
|
+
|
|
|
static void requeue_deferred_bios(struct cache *cache)
|
|
|
{
|
|
|
struct bio *bio;
|
|
@@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-static int more_work(struct cache *cache)
|
|
|
-{
|
|
|
- if (is_quiescing(cache))
|
|
|
- return !list_empty(&cache->quiesced_migrations) ||
|
|
|
- !list_empty(&cache->completed_migrations) ||
|
|
|
- !list_empty(&cache->need_commit_migrations);
|
|
|
- else
|
|
|
- return !bio_list_empty(&cache->deferred_bios) ||
|
|
|
- !list_empty(&cache->deferred_cells) ||
|
|
|
- !bio_list_empty(&cache->deferred_flush_bios) ||
|
|
|
- !bio_list_empty(&cache->deferred_writethrough_bios) ||
|
|
|
- !list_empty(&cache->quiesced_migrations) ||
|
|
|
- !list_empty(&cache->completed_migrations) ||
|
|
|
- !list_empty(&cache->need_commit_migrations) ||
|
|
|
- cache->invalidate;
|
|
|
-}
|
|
|
-
|
|
|
-static void do_worker(struct work_struct *ws)
|
|
|
-{
|
|
|
- struct cache *cache = container_of(ws, struct cache, worker);
|
|
|
-
|
|
|
- do {
|
|
|
- if (!is_quiescing(cache)) {
|
|
|
- writeback_some_dirty_blocks(cache);
|
|
|
- process_deferred_writethrough_bios(cache);
|
|
|
- process_deferred_bios(cache);
|
|
|
- process_deferred_cells(cache);
|
|
|
- process_invalidation_requests(cache);
|
|
|
- }
|
|
|
-
|
|
|
- process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
|
|
|
- process_migrations(cache, &cache->completed_migrations, complete_migration);
|
|
|
-
|
|
|
- if (commit_if_needed(cache)) {
|
|
|
- process_deferred_flush_bios(cache, false);
|
|
|
- process_migrations(cache, &cache->need_commit_migrations, migration_failure);
|
|
|
- } else {
|
|
|
- process_deferred_flush_bios(cache, true);
|
|
|
- process_migrations(cache, &cache->need_commit_migrations,
|
|
|
- migration_success_post_commit);
|
|
|
- }
|
|
|
-
|
|
|
- ack_quiescing(cache);
|
|
|
-
|
|
|
- } while (more_work(cache));
|
|
|
-}
|
|
|
-
|
|
|
/*
|
|
|
* We want to commit periodically so that not too much
|
|
|
* unwritten metadata builds up.
|
|
@@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws)
|
|
|
static void do_waker(struct work_struct *ws)
|
|
|
{
|
|
|
struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
|
|
|
+
|
|
|
policy_tick(cache->policy, true);
|
|
|
- wake_worker(cache);
|
|
|
+ wake_migration_worker(cache);
|
|
|
+ schedule_commit(&cache->committer);
|
|
|
queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
|
|
|
}
|
|
|
|
|
|
-/*----------------------------------------------------------------*/
|
|
|
-
|
|
|
-static int is_congested(struct dm_dev *dev, int bdi_bits)
|
|
|
+static void check_migrations(struct work_struct *ws)
|
|
|
{
|
|
|
- struct request_queue *q = bdev_get_queue(dev->bdev);
|
|
|
- return bdi_congested(q->backing_dev_info, bdi_bits);
|
|
|
-}
|
|
|
+ int r;
|
|
|
+ struct policy_work *op;
|
|
|
+ struct cache *cache = container_of(ws, struct cache, migration_worker);
|
|
|
+ enum busy b;
|
|
|
|
|
|
-static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
|
|
|
-{
|
|
|
- struct cache *cache = container_of(cb, struct cache, callbacks);
|
|
|
+ for (;;) {
|
|
|
+ b = spare_migration_bandwidth(cache);
|
|
|
+ if (b == BUSY)
|
|
|
+ break;
|
|
|
|
|
|
- return is_congested(cache->origin_dev, bdi_bits) ||
|
|
|
- is_congested(cache->cache_dev, bdi_bits);
|
|
|
+ r = policy_get_background_work(cache->policy, b == IDLE, &op);
|
|
|
+ if (r == -ENODATA)
|
|
|
+ break;
|
|
|
+
|
|
|
+ if (r) {
|
|
|
+ DMERR_LIMIT("%s: policy_background_work failed",
|
|
|
+ cache_device_name(cache));
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ r = mg_start(cache, op, NULL);
|
|
|
+ if (r)
|
|
|
+ break;
|
|
|
+ }
|
|
|
}
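
/*
 * Editorial note: the loop above keeps pulling background work until the
 * policy reports -ENODATA or spare_migration_bandwidth() says BUSY.  Each
 * mg_start(cache, op, NULL) call has no overwrite bio, so it bumps
 * nr_io_migrations, which feeds straight back into the next bandwidth
 * check.
 */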
|
|
|
|
|
|
/*----------------------------------------------------------------
|
|
@@ -2310,11 +2078,8 @@ static void destroy(struct cache *cache)
|
|
|
|
|
|
mempool_destroy(cache->migration_pool);
|
|
|
|
|
|
- if (cache->all_io_ds)
|
|
|
- dm_deferred_set_destroy(cache->all_io_ds);
|
|
|
-
|
|
|
if (cache->prison)
|
|
|
- dm_bio_prison_destroy(cache->prison);
|
|
|
+ dm_bio_prison_destroy_v2(cache->prison);
|
|
|
|
|
|
if (cache->wq)
|
|
|
destroy_workqueue(cache->wq);
|
|
@@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
|
|
|
return PTR_ERR(p);
|
|
|
}
|
|
|
cache->policy = p;
|
|
|
+ BUG_ON(!cache->policy);
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
@@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size)
|
|
|
cache->cache_size = size;
|
|
|
}
|
|
|
|
|
|
+static int is_congested(struct dm_dev *dev, int bdi_bits)
|
|
|
+{
|
|
|
+ struct request_queue *q = bdev_get_queue(dev->bdev);
|
|
|
+ return bdi_congested(q->backing_dev_info, bdi_bits);
|
|
|
+}
|
|
|
+
|
|
|
+static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
|
|
|
+{
|
|
|
+ struct cache *cache = container_of(cb, struct cache, callbacks);
|
|
|
+
|
|
|
+ return is_congested(cache->origin_dev, bdi_bits) ||
|
|
|
+ is_congested(cache->cache_dev, bdi_bits);
|
|
|
+}
|
|
|
+
|
|
|
#define DEFAULT_MIGRATION_THRESHOLD 2048
|
|
|
|
|
|
static int cache_create(struct cache_args *ca, struct cache **result)
|
|
@@ -2787,7 +2567,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
|
|
|
|
|
|
ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
|
|
|
|
|
|
- /* FIXME: factor out this whole section */
|
|
|
origin_blocks = cache->origin_sectors = ca->origin_sectors;
|
|
|
origin_blocks = block_div(origin_blocks, ca->block_size);
|
|
|
cache->origin_blocks = to_oblock(origin_blocks);
|
|
@@ -2853,24 +2632,18 @@ static int cache_create(struct cache_args *ca, struct cache **result)
|
|
|
r = -EINVAL;
|
|
|
goto bad;
|
|
|
}
|
|
|
+
|
|
|
+ policy_allow_migrations(cache->policy, false);
|
|
|
}
|
|
|
|
|
|
spin_lock_init(&cache->lock);
|
|
|
INIT_LIST_HEAD(&cache->deferred_cells);
|
|
|
bio_list_init(&cache->deferred_bios);
|
|
|
- bio_list_init(&cache->deferred_flush_bios);
|
|
|
bio_list_init(&cache->deferred_writethrough_bios);
|
|
|
- INIT_LIST_HEAD(&cache->quiesced_migrations);
|
|
|
- INIT_LIST_HEAD(&cache->completed_migrations);
|
|
|
- INIT_LIST_HEAD(&cache->need_commit_migrations);
|
|
|
atomic_set(&cache->nr_allocated_migrations, 0);
|
|
|
atomic_set(&cache->nr_io_migrations, 0);
|
|
|
init_waitqueue_head(&cache->migration_wait);
|
|
|
|
|
|
- init_waitqueue_head(&cache->quiescing_wait);
|
|
|
- atomic_set(&cache->quiescing, 0);
|
|
|
- atomic_set(&cache->quiescing_ack, 0);
|
|
|
-
|
|
|
r = -ENOMEM;
|
|
|
atomic_set(&cache->nr_dirty, 0);
|
|
|
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
|
|
@@ -2899,27 +2672,23 @@ static int cache_create(struct cache_args *ca, struct cache **result)
|
|
|
goto bad;
|
|
|
}
|
|
|
|
|
|
- cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
|
|
|
+ cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
|
|
|
if (!cache->wq) {
|
|
|
*error = "could not create workqueue for metadata object";
|
|
|
goto bad;
|
|
|
}
|
|
|
- INIT_WORK(&cache->worker, do_worker);
|
|
|
+ INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
|
|
|
+ INIT_WORK(&cache->deferred_writethrough_worker,
|
|
|
+ process_deferred_writethrough_bios);
|
|
|
+ INIT_WORK(&cache->migration_worker, check_migrations);
|
|
|
INIT_DELAYED_WORK(&cache->waker, do_waker);
|
|
|
- cache->last_commit_jiffies = jiffies;
|
|
|
|
|
|
- cache->prison = dm_bio_prison_create();
|
|
|
+ cache->prison = dm_bio_prison_create_v2(cache->wq);
|
|
|
if (!cache->prison) {
|
|
|
*error = "could not create bio prison";
|
|
|
goto bad;
|
|
|
}
|
|
|
|
|
|
- cache->all_io_ds = dm_deferred_set_create();
|
|
|
- if (!cache->all_io_ds) {
|
|
|
- *error = "could not create all_io deferred set";
|
|
|
- goto bad;
|
|
|
- }
|
|
|
-
|
|
|
cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
|
|
|
migration_cache);
|
|
|
if (!cache->migration_pool) {
|
|
@@ -2946,11 +2715,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
|
|
|
spin_lock_init(&cache->invalidation_lock);
|
|
|
INIT_LIST_HEAD(&cache->invalidation_requests);
|
|
|
|
|
|
+ batcher_init(&cache->committer, commit_op, cache,
|
|
|
+ issue_op, cache, cache->wq);
|
|
|
iot_init(&cache->origin_tracker);
|
|
|
|
|
|
+ init_rwsem(&cache->background_work_lock);
|
|
|
+ prevent_background_work(cache);
|
|
|
+
|
|
|
*result = cache;
|
|
|
return 0;
|
|
|
-
|
|
|
bad:
|
|
|
destroy(cache);
|
|
|
return r;
|
|
@@ -3008,7 +2781,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
|
|
|
}
|
|
|
|
|
|
ti->private = cache;
|
|
|
-
|
|
|
out:
|
|
|
destroy_cache_args(ca);
|
|
|
return r;
|
|
@@ -3021,17 +2793,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
|
|
|
struct cache *cache = ti->private;
|
|
|
|
|
|
int r;
|
|
|
- struct dm_bio_prison_cell *cell = NULL;
|
|
|
+ bool commit_needed;
|
|
|
dm_oblock_t block = get_bio_block(cache, bio);
|
|
|
size_t pb_data_size = get_per_bio_data_size(cache);
|
|
|
- bool can_migrate = false;
|
|
|
- bool fast_promotion;
|
|
|
- struct policy_result lookup_result;
|
|
|
- struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
|
|
|
- struct old_oblock_lock ool;
|
|
|
-
|
|
|
- ool.locker.fn = null_locker;
|
|
|
|
|
|
+ init_per_bio_data(bio, pb_data_size);
|
|
|
if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
|
|
|
/*
|
|
|
* This can only occur if the io goes to a partial block at
|
|
@@ -3048,101 +2814,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
|
|
|
return DM_MAPIO_SUBMITTED;
|
|
|
}
|
|
|
|
|
|
- /*
|
|
|
- * Check to see if that block is currently migrating.
|
|
|
- */
|
|
|
- cell = alloc_prison_cell(cache);
|
|
|
- if (!cell) {
|
|
|
- defer_bio(cache, bio);
|
|
|
- return DM_MAPIO_SUBMITTED;
|
|
|
- }
|
|
|
-
|
|
|
- r = bio_detain(cache, block, bio, cell,
|
|
|
- (cell_free_fn) free_prison_cell,
|
|
|
- cache, &cell);
|
|
|
- if (r) {
|
|
|
- if (r < 0)
|
|
|
- defer_bio(cache, bio);
|
|
|
-
|
|
|
- return DM_MAPIO_SUBMITTED;
|
|
|
- }
|
|
|
-
|
|
|
- fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
|
|
|
-
|
|
|
- r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
|
|
|
- bio, &ool.locker, &lookup_result);
|
|
|
- if (r == -EWOULDBLOCK) {
|
|
|
- cell_defer(cache, cell, true);
|
|
|
- return DM_MAPIO_SUBMITTED;
|
|
|
-
|
|
|
- } else if (r) {
|
|
|
- DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
|
|
|
- cache_device_name(cache), r);
|
|
|
- cell_defer(cache, cell, false);
|
|
|
- bio_io_error(bio);
|
|
|
- return DM_MAPIO_SUBMITTED;
|
|
|
- }
|
|
|
-
|
|
|
- r = DM_MAPIO_REMAPPED;
|
|
|
- switch (lookup_result.op) {
|
|
|
- case POLICY_HIT:
|
|
|
- if (passthrough_mode(&cache->features)) {
|
|
|
- if (bio_data_dir(bio) == WRITE) {
|
|
|
- /*
|
|
|
- * We need to invalidate this block, so
|
|
|
- * defer for the worker thread.
|
|
|
- */
|
|
|
- cell_defer(cache, cell, true);
|
|
|
- r = DM_MAPIO_SUBMITTED;
|
|
|
-
|
|
|
- } else {
|
|
|
- inc_miss_counter(cache, bio);
|
|
|
- remap_to_origin_clear_discard(cache, bio, block);
|
|
|
- accounted_begin(cache, bio);
|
|
|
- inc_ds(cache, bio, cell);
|
|
|
- // FIXME: we want to remap hits or misses straight
|
|
|
- // away rather than passing over to the worker.
|
|
|
- cell_defer(cache, cell, false);
|
|
|
- }
|
|
|
-
|
|
|
- } else {
|
|
|
- inc_hit_counter(cache, bio);
|
|
|
- if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
|
|
|
- !is_dirty(cache, lookup_result.cblock)) {
|
|
|
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
|
|
|
- accounted_begin(cache, bio);
|
|
|
- inc_ds(cache, bio, cell);
|
|
|
- cell_defer(cache, cell, false);
|
|
|
-
|
|
|
- } else
|
|
|
- remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
|
|
|
- }
|
|
|
- break;
|
|
|
-
|
|
|
- case POLICY_MISS:
|
|
|
- inc_miss_counter(cache, bio);
|
|
|
- if (pb->req_nr != 0) {
|
|
|
- /*
|
|
|
- * This is a duplicate writethrough io that is no
|
|
|
- * longer needed because the block has been demoted.
|
|
|
- */
|
|
|
- bio_endio(bio);
|
|
|
- // FIXME: remap everything as a miss
|
|
|
- cell_defer(cache, cell, false);
|
|
|
- r = DM_MAPIO_SUBMITTED;
|
|
|
-
|
|
|
- } else
|
|
|
- remap_cell_to_origin_clear_discard(cache, cell, block, false);
|
|
|
- break;
|
|
|
-
|
|
|
- default:
|
|
|
- DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
|
|
|
- cache_device_name(cache), __func__,
|
|
|
- (unsigned) lookup_result.op);
|
|
|
- cell_defer(cache, cell, false);
|
|
|
- bio_io_error(bio);
|
|
|
- r = DM_MAPIO_SUBMITTED;
|
|
|
- }
|
|
|
+ r = map_bio(cache, bio, block, &commit_needed);
|
|
|
+ if (commit_needed)
|
|
|
+ schedule_commit(&cache->committer);
|
|
|
|
|
|
return r;
|
|
|
}
|
|
@@ -3162,7 +2836,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
|
|
|
spin_unlock_irqrestore(&cache->lock, flags);
|
|
|
}
|
|
|
|
|
|
- check_for_quiesced_migrations(cache, pb);
|
|
|
+ bio_drop_shared_lock(cache, bio);
|
|
|
accounted_complete(cache, bio);
|
|
|
|
|
|
return 0;
|
|
@@ -3262,12 +2936,18 @@ static void cache_postsuspend(struct dm_target *ti)
|
|
|
{
|
|
|
struct cache *cache = ti->private;
|
|
|
|
|
|
- start_quiescing(cache);
|
|
|
- wait_for_migrations(cache);
|
|
|
- stop_worker(cache);
|
|
|
+ prevent_background_work(cache);
|
|
|
+ BUG_ON(atomic_read(&cache->nr_io_migrations));
|
|
|
+
|
|
|
+ cancel_delayed_work(&cache->waker);
|
|
|
+ flush_workqueue(cache->wq);
|
|
|
+ WARN_ON(cache->origin_tracker.in_flight);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * If it's a flush suspend there won't be any deferred bios, so this
|
|
|
+ * call is harmless.
|
|
|
+ */
|
|
|
requeue_deferred_bios(cache);
|
|
|
- requeue_deferred_cells(cache);
|
|
|
- stop_quiescing(cache);
|
|
|
|
|
|
if (get_cache_mode(cache) == CM_WRITE)
|
|
|
(void) sync_metadata(cache);
|
|
@@ -3279,15 +2959,16 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
|
|
|
int r;
|
|
|
struct cache *cache = context;
|
|
|
|
|
|
- r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
|
|
|
+ if (dirty) {
|
|
|
+ set_bit(from_cblock(cblock), cache->dirty_bitset);
|
|
|
+ atomic_inc(&cache->nr_dirty);
|
|
|
+ } else
|
|
|
+ clear_bit(from_cblock(cblock), cache->dirty_bitset);
|
|
|
+
|
|
|
+ r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
|
|
|
if (r)
|
|
|
return r;
|
|
|
|
|
|
- if (dirty)
|
|
|
- set_dirty(cache, oblock, cblock);
|
|
|
- else
|
|
|
- clear_dirty(cache, oblock, cblock);
|
|
|
-
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
@@ -3486,6 +3167,7 @@ static void cache_resume(struct dm_target *ti)
|
|
|
struct cache *cache = ti->private;
|
|
|
|
|
|
cache->need_tick_bio = true;
|
|
|
+ allow_background_work(cache);
|
|
|
do_waker(&cache->waker.work);
|
|
|
}
|
|
|
|
|
@@ -3619,11 +3301,20 @@ err:
|
|
|
DMEMIT("Error");
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * Defines a range of cblocks, begin to (end - 1) are in the range. end is
|
|
|
+ * the one-past-the-end value.
|
|
|
+ */
|
|
|
+struct cblock_range {
|
|
|
+ dm_cblock_t begin;
|
|
|
+ dm_cblock_t end;
|
|
|
+};
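
/*
 * A minimal sketch using the half-open convention above.
 * cblock_range_size() is a hypothetical helper, not part of this patch;
 * from_cblock() is the existing accessor used throughout the file.
 */
static inline uint64_t cblock_range_size(struct cblock_range *r)
{
	/* end is one past the last cblock, so the count is a plain difference. */
	return from_cblock(r->end) - from_cblock(r->begin);
}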
|
|
|
+
|
|
|
/*
|
|
|
* A cache block range can take two forms:
|
|
|
*
|
|
|
* i) A single cblock, eg. '3456'
|
|
|
- * ii) A begin and end cblock with dots between, eg. 123-234
|
|
|
+ * ii) A begin and end cblock with a dash between, eg. 123-234
|
|
|
*/
|
|
|
static int parse_cblock_range(struct cache *cache, const char *str,
|
|
|
struct cblock_range *result)
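
/*
 * Hedged usage example (not taken from this patch): ranges in either form
 * are expected to reach this parser via the target's "invalidate_cblocks"
 * message, along the lines of
 *
 *     dmsetup message <cache-dev> 0 invalidate_cblocks 3456 123-234
 *
 * which process_invalidate_cblocks_message() then feeds through
 * parse_cblock_range() and validate_cblock_range().
 */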
|
|
@@ -3689,23 +3380,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+static inline dm_cblock_t cblock_succ(dm_cblock_t b)
|
|
|
+{
|
|
|
+ return to_cblock(from_cblock(b) + 1);
|
|
|
+}
|
|
|
+
|
|
|
static int request_invalidation(struct cache *cache, struct cblock_range *range)
|
|
|
{
|
|
|
- struct invalidation_request req;
|
|
|
+ int r = 0;
|
|
|
|
|
|
- INIT_LIST_HEAD(&req.list);
|
|
|
- req.cblocks = range;
|
|
|
- atomic_set(&req.complete, 0);
|
|
|
- req.err = 0;
|
|
|
- init_waitqueue_head(&req.result_wait);
|
|
|
+ /*
|
|
|
+ * We don't need to do any locking here because we know we're in
|
|
|
+	 * passthrough mode. There is potential for a race between an
|
|
|
+ * invalidation triggered by an io and an invalidation message. This
|
|
|
+	 * is harmless; we must not worry if the policy call fails.
|
|
|
+ */
|
|
|
+ while (range->begin != range->end) {
|
|
|
+ r = invalidate_cblock(cache, range->begin);
|
|
|
+ if (r)
|
|
|
+ return r;
|
|
|
|
|
|
- spin_lock(&cache->invalidation_lock);
|
|
|
- list_add(&req.list, &cache->invalidation_requests);
|
|
|
- spin_unlock(&cache->invalidation_lock);
|
|
|
- wake_worker(cache);
|
|
|
+ range->begin = cblock_succ(range->begin);
|
|
|
+ }
|
|
|
|
|
|
- wait_event(req.result_wait, atomic_read(&req.complete));
|
|
|
- return req.err;
|
|
|
+ cache->commit_requested = true;
|
|
|
+ return r;
|
|
|
}
|
|
|
|
|
|
static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
|
|
@@ -3815,7 +3514,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
|
|
|
|
|
|
static struct target_type cache_target = {
|
|
|
.name = "cache",
|
|
|
- .version = {1, 10, 0},
|
|
|
+ .version = {2, 0, 0},
|
|
|
.module = THIS_MODULE,
|
|
|
.ctr = cache_ctr,
|
|
|
.dtr = cache_dtr,
|