Эх сурвалжийг харах

Merge tag 'dm-4.8-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - initially based on Jens' 'for-4.8/core' (given all the flag churn)
   and later merged with 'for-4.8/core' to pickup the QUEUE_FLAG_DAX
   commits that DM depends on to provide its DAX support

 - clean up the bio-based vs request-based DM core code by moving the
   request-based DM core code out to dm-rq.[hc]

 - reinstate bio-based support in the DM multipath target (done with the
   idea that fast storage like NVMe over Fabrics could benefit) -- while
   preserving support for request_fn and blk-mq request-based DM mpath

 - SCSI and DM multipath persistent reservation fixes that were
   coordinated with Martin Petersen.

 - the DM raid target saw the most extensive change this cycle; it now
   provides reshape and takeover support (by layering on top of the
   corresponding MD capabilities)

 - DAX support for DM core and the linear, stripe and error targets

 - a DM thin-provisioning block discard vs allocation race fix that
   addresses potential for corruption

 - a stable fix for DM verity-fec's block calculation during decode

 - a few cleanups and fixes to DM core and various targets

* tag 'dm-4.8-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (73 commits)
  dm: allow bio-based table to be upgraded to bio-based with DAX support
  dm snap: add fake origin_direct_access
  dm stripe: add DAX support
  dm error: add DAX support
  dm linear: add DAX support
  dm: add infrastructure for DAX support
  dm thin: fix a race condition between discarding and provisioning a block
  dm btree: fix a bug in dm_btree_find_next_single()
  dm raid: fix random optimal_io_size for raid0
  dm raid: address checkpatch.pl complaints
  dm: call PR reserve/unreserve on each underlying device
  sd: don't use the ALL_TG_PT bit for reservations
  dm: fix second blk_delay_queue() parameter to be in msec units not jiffies
  dm raid: change logical functions to actually return bool
  dm raid: use rdev_for_each in status
  dm raid: use rs->raid_disks to avoid memory leaks on free
  dm raid: support delta_disks for raid1, fix table output
  dm raid: enhance reshape check and factor out reshape setup
  dm raid: allow resize during recovery
  dm raid: fix rs_is_recovering() to allow for lvextend
  ...
Linus Torvalds 9 жил өмнө
parent
commit
f7e6816994

+ 55 - 3
Documentation/device-mapper/dm-raid.txt

@@ -14,8 +14,12 @@ The target is named "raid" and it accepts the following parameters:
     <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
     <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
 
 
 <raid_type>:
 <raid_type>:
+  raid0		RAID0 striping (no resilience)
   raid1		RAID1 mirroring
   raid1		RAID1 mirroring
-  raid4		RAID4 dedicated parity disk
+  raid4		RAID4 with dedicated last parity disk
+  raid5_n 	RAID5 with dedicated last parity disk supporting takeover
+		Same as raid4
+		-Transitory layout
   raid5_la	RAID5 left asymmetric
   raid5_la	RAID5 left asymmetric
 		- rotating parity 0 with data continuation
 		- rotating parity 0 with data continuation
   raid5_ra	RAID5 right asymmetric
   raid5_ra	RAID5 right asymmetric
@@ -30,7 +34,19 @@ The target is named "raid" and it accepts the following parameters:
 		- rotating parity N (right-to-left) with data restart
 		- rotating parity N (right-to-left) with data restart
   raid6_nc	RAID6 N continue
   raid6_nc	RAID6 N continue
 		- rotating parity N (right-to-left) with data continuation
 		- rotating parity N (right-to-left) with data continuation
+  raid6_n_6	RAID6 with dedicated parity disks
+		- parity and Q-syndrome on the last 2 disks;
+		  layout for takeover from/to raid4/raid5_n
+  raid6_la_6	Same as "raid5_la" plus dedicated last Q-syndrome disk
+		- layout for takeover from raid5_la from/to raid6
+  raid6_ra_6	Same as "raid5_ra" plus dedicated last Q-syndrome disk
+		- layout for takeover from raid5_ra from/to raid6
+  raid6_ls_6	Same as "raid5_ls" plus dedicated last Q-syndrome disk
+		- layout for takeover from raid5_ls from/to raid6
+  raid6_rs_6	Same as "raid5_rs" plus dedicated last Q-syndrome disk
+		- layout for takeover from raid5_rs from/to raid6
   raid10        Various RAID10 inspired algorithms chosen by additional params
   raid10        Various RAID10 inspired algorithms chosen by additional params
+		(see raid10_format and raid10_copies below)
 		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
 		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
 		- RAID1E: Integrated Adjacent Stripe Mirroring
 		- RAID1E: Integrated Adjacent Stripe Mirroring
 		- RAID1E: Integrated Offset Stripe Mirroring
 		- RAID1E: Integrated Offset Stripe Mirroring
@@ -116,10 +132,41 @@ The target is named "raid" and it accepts the following parameters:
 		Here we see layouts closely akin to 'RAID1E - Integrated
 		Here we see layouts closely akin to 'RAID1E - Integrated
 		Offset Stripe Mirroring'.
 		Offset Stripe Mirroring'.
 
 
+        [delta_disks <N>]
+		The delta_disks option value (-251 < N < +251) triggers
+		device removal (negative value) or device addition (positive
+		value) to any reshape supporting raid levels 4/5/6 and 10.
+		RAID levels 4/5/6 allow for addition of devices (metadata
+		and data device tuple), raid10_near and raid10_offset only
+		allow for device addition. raid10_far does not support any
+		reshaping at all.
+		A minimum number of devices has to be kept to enforce resilience,
+		which is 3 devices for raid4/5 and 4 devices for raid6.
+
+        [data_offset <sectors>]
+		This option value defines the offset into each data device
+		where the data starts. This is used to provide out-of-place
+		reshaping space to avoid writing over data whilst
+		changing the layout of stripes, hence an interruption/crash
+		may happen at any time without the risk of losing data.
+		E.g. when adding devices to an existing raid set during
+		forward reshaping, the out-of-place space will be allocated
+		at the beginning of each raid device. The kernel raid4/5/6/10
+		MD personalities supporting such device addition will read the data from
+		the existing first stripes (those with smaller number of stripes)
+		starting at data_offset to fill up a new stripe with the larger
+		number of stripes, calculate the redundancy blocks (CRC/Q-syndrome)
+		and write that new stripe to offset 0. Same will be applied to all
+		N-1 other new stripes. This out-of-place scheme is used to change
+		the RAID type (i.e. the allocation algorithm) as well, e.g.
+		changing from raid5_ls to raid5_n.
+
 <#raid_devs>: The number of devices composing the array.
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the
 	containing the metadata (if any); the second is the one containing the
-	data.
+	data. A maximum of 64 metadata/data device entries are supported
+	up to target version 1.8.0.
+	1.9.0 supports up to 253 which is enforced by the used MD kernel runtime.
 
 
 	If a drive has failed or is missing at creation time, a '-' can be
 	If a drive has failed or is missing at creation time, a '-' can be
 	given for both the metadata and data drives for a given position.
 	given for both the metadata and data drives for a given position.
@@ -207,7 +254,6 @@ include:
 	"recover"- Initiate/continue a recover process.
 	"recover"- Initiate/continue a recover process.
 	"check"  - Initiate a check (i.e. a "scrub") of the array.
 	"check"  - Initiate a check (i.e. a "scrub") of the array.
 	"repair" - Initiate a repair of the array.
 	"repair" - Initiate a repair of the array.
-	"reshape"- Currently unsupported (-EINVAL).
 
 
 
 
 Discard Support
 Discard Support
@@ -257,3 +303,9 @@ Version History
 1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
 1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
 1.6.0   Add discard support (and devices_handle_discard_safely module param).
 1.6.0   Add discard support (and devices_handle_discard_safely module param).
 1.7.0   Add support for MD RAID0 mappings.
 1.7.0   Add support for MD RAID0 mappings.
+1.8.0   Explicitly check for compatible flags in the superblock metadata
+	and reject to start the raid set if any are set by a newer
+	target version, thus avoiding data corruption on a raid set
+	with a reshape in progress.
+1.9.0   Add support for RAID level takeover/reshape/region size
+	and set size reduction.

+ 2 - 1
drivers/md/Makefile

@@ -3,7 +3,8 @@
 #
 #
 
 
 dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
 dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \
+		   dm-rq.o
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
 		    dm-snap-persistent.o

+ 1 - 1
drivers/md/dm-builtin.c

@@ -1,4 +1,4 @@
-#include "dm.h"
+#include "dm-core.h"
 
 
 /*
 /*
  * The kobject release method must not be placed in the module itself,
  * The kobject release method must not be placed in the module itself,

+ 149 - 0
drivers/md/dm-core.h

@@ -0,0 +1,149 @@
+/*
+ * Internal header file _only_ for device mapper core
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_CORE_INTERNAL_H
+#define DM_CORE_INTERNAL_H
+
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/blk-mq.h>
+
+#include <trace/events/block.h>
+
+#include "dm.h"
+
+#define DM_RESERVED_MAX_IOS		1024
+
+struct dm_kobject_holder {
+	struct kobject kobj;
+	struct completion completion;
+};
+
+/*
+ * DM core internal structure that is used directly by dm.c and dm-rq.c
+ * DM targets must _not_ dereference a mapped_device to directly access its members!
+ */
+struct mapped_device {
+	struct srcu_struct io_barrier;
+	struct mutex suspend_lock;
+
+	/*
+	 * The current mapping (struct dm_table *).
+	 * Use dm_get_live_table{_fast} or take suspend_lock for
+	 * dereference.
+	 */
+	void __rcu *map;
+
+	struct list_head table_devices;
+	struct mutex table_devices_lock;
+
+	unsigned long flags;
+
+	struct request_queue *queue;
+	int numa_node_id;
+
+	unsigned type;
+	/* Protect queue and type against concurrent access. */
+	struct mutex type_lock;
+
+	atomic_t holders;
+	atomic_t open_count;
+
+	struct dm_target *immutable_target;
+	struct target_type *immutable_target_type;
+
+	struct gendisk *disk;
+	char name[16];
+
+	void *interface_ptr;
+
+	/*
+	 * A list of ios that arrived while we were suspended.
+	 */
+	atomic_t pending[2];
+	wait_queue_head_t wait;
+	struct work_struct work;
+	spinlock_t deferred_lock;
+	struct bio_list deferred;
+
+	/*
+	 * Event handling.
+	 */
+	wait_queue_head_t eventq;
+	atomic_t event_nr;
+	atomic_t uevent_seq;
+	struct list_head uevent_list;
+	spinlock_t uevent_lock; /* Protect access to uevent_list */
+
+	/* the number of internal suspends */
+	unsigned internal_suspend_count;
+
+	/*
+	 * Processing queue (flush)
+	 */
+	struct workqueue_struct *wq;
+
+	/*
+	 * io objects are allocated from here.
+	 */
+	mempool_t *io_pool;
+	mempool_t *rq_pool;
+
+	struct bio_set *bs;
+
+	/*
+	 * freeze/thaw support requires holding onto a super block
+	 */
+	struct super_block *frozen_sb;
+
+	/* forced geometry settings */
+	struct hd_geometry geometry;
+
+	struct block_device *bdev;
+
+	/* kobject and completion */
+	struct dm_kobject_holder kobj_holder;
+
+	/* zero-length flush that will be cloned and submitted to targets */
+	struct bio flush_bio;
+
+	struct dm_stats stats;
+
+	struct kthread_worker kworker;
+	struct task_struct *kworker_task;
+
+	/* for request-based merge heuristic in dm_request_fn() */
+	unsigned seq_rq_merge_deadline_usecs;
+	int last_rq_rw;
+	sector_t last_rq_pos;
+	ktime_t last_rq_start_time;
+
+	/* for blk-mq request-based DM support */
+	struct blk_mq_tag_set *tag_set;
+	bool use_blk_mq:1;
+	bool init_tio_pdu:1;
+};
+
+void dm_init_md_queue(struct mapped_device *md);
+void dm_init_normal_md_queue(struct mapped_device *md);
+int md_in_flight(struct mapped_device *md);
+void disable_write_same(struct mapped_device *md);
+
+static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
+{
+	return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
+}
+
+unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max);
+
+static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
+{
+	return !maxlen || strlen(result) + 1 >= maxlen;
+}
+
+#endif

+ 2 - 2
drivers/md/dm-crypt.c

@@ -683,7 +683,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
 				  u8 *data)
 				  u8 *data)
 {
 {
 	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
 	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
-	u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+	__le64 sector = cpu_to_le64(dmreq->iv_sector);
 	u8 buf[TCW_WHITENING_SIZE];
 	u8 buf[TCW_WHITENING_SIZE];
 	SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm);
 	SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm);
 	int i, r;
 	int i, r;
@@ -722,7 +722,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
 			    struct dm_crypt_request *dmreq)
 			    struct dm_crypt_request *dmreq)
 {
 {
 	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
 	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
-	u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+	__le64 sector = cpu_to_le64(dmreq->iv_sector);
 	u8 *src;
 	u8 *src;
 	int r = 0;
 	int r = 0;
 
 

+ 1 - 1
drivers/md/dm-io.c

@@ -5,7 +5,7 @@
  * This file is released under the GPL.
  * This file is released under the GPL.
  */
  */
 
 
-#include "dm.h"
+#include "dm-core.h"
 
 
 #include <linux/device-mapper.h>
 #include <linux/device-mapper.h>
 
 

+ 17 - 14
drivers/md/dm-ioctl.c

@@ -5,7 +5,7 @@
  * This file is released under the GPL.
  * This file is released under the GPL.
  */
  */
 
 
-#include "dm.h"
+#include "dm-core.h"
 
 
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/vmalloc.h>
@@ -1267,6 +1267,15 @@ static int populate_table(struct dm_table *table,
 	return dm_table_complete(table);
 	return dm_table_complete(table);
 }
 }
 
 
+static bool is_valid_type(unsigned cur, unsigned new)
+{
+	if (cur == new ||
+	    (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED))
+		return true;
+
+	return false;
+}
+
 static int table_load(struct dm_ioctl *param, size_t param_size)
 static int table_load(struct dm_ioctl *param, size_t param_size)
 {
 {
 	int r;
 	int r;
@@ -1309,7 +1318,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 			DMWARN("unable to set up device queue for new table.");
 			DMWARN("unable to set up device queue for new table.");
 			goto err_unlock_md_type;
 			goto err_unlock_md_type;
 		}
 		}
-	} else if (dm_get_md_type(md) != dm_table_get_type(t)) {
+	} else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) {
 		DMWARN("can't change device type after initial table load.");
 		DMWARN("can't change device type after initial table load.");
 		r = -EINVAL;
 		r = -EINVAL;
 		goto err_unlock_md_type;
 		goto err_unlock_md_type;
@@ -1670,8 +1679,7 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
 	return r;
 	return r;
 }
 }
 
 
-#define DM_PARAMS_KMALLOC	0x0001	/* Params alloced with kmalloc */
-#define DM_PARAMS_VMALLOC	0x0002	/* Params alloced with vmalloc */
+#define DM_PARAMS_MALLOC	0x0001	/* Params allocated with kvmalloc() */
 #define DM_WIPE_BUFFER		0x0010	/* Wipe input buffer before returning from ioctl */
 #define DM_WIPE_BUFFER		0x0010	/* Wipe input buffer before returning from ioctl */
 
 
 static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
 static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
@@ -1679,10 +1687,8 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla
 	if (param_flags & DM_WIPE_BUFFER)
 	if (param_flags & DM_WIPE_BUFFER)
 		memset(param, 0, param_size);
 		memset(param, 0, param_size);
 
 
-	if (param_flags & DM_PARAMS_KMALLOC)
-		kfree(param);
-	if (param_flags & DM_PARAMS_VMALLOC)
-		vfree(param);
+	if (param_flags & DM_PARAMS_MALLOC)
+		kvfree(param);
 }
 }
 
 
 static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
 static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
@@ -1714,19 +1720,14 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
 	 * Use kmalloc() rather than vmalloc() when we can.
 	 * Use kmalloc() rather than vmalloc() when we can.
 	 */
 	 */
 	dmi = NULL;
 	dmi = NULL;
-	if (param_kernel->data_size <= KMALLOC_MAX_SIZE) {
+	if (param_kernel->data_size <= KMALLOC_MAX_SIZE)
 		dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 		dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
-		if (dmi)
-			*param_flags |= DM_PARAMS_KMALLOC;
-	}
 
 
 	if (!dmi) {
 	if (!dmi) {
 		unsigned noio_flag;
 		unsigned noio_flag;
 		noio_flag = memalloc_noio_save();
 		noio_flag = memalloc_noio_save();
 		dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
 		dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
 		memalloc_noio_restore(noio_flag);
 		memalloc_noio_restore(noio_flag);
-		if (dmi)
-			*param_flags |= DM_PARAMS_VMALLOC;
 	}
 	}
 
 
 	if (!dmi) {
 	if (!dmi) {
@@ -1735,6 +1736,8 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
 
 
+	*param_flags |= DM_PARAMS_MALLOC;
+
 	if (copy_from_user(dmi, user, param_kernel->data_size))
 	if (copy_from_user(dmi, user, param_kernel->data_size))
 		goto bad;
 		goto bad;
 
 

+ 1 - 1
drivers/md/dm-kcopyd.c

@@ -26,7 +26,7 @@
 #include <linux/device-mapper.h>
 #include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>
 #include <linux/dm-kcopyd.h>
 
 
-#include "dm.h"
+#include "dm-core.h"
 
 
 #define SUB_JOB_SIZE	128
 #define SUB_JOB_SIZE	128
 #define SPLIT_COUNT	8
 #define SPLIT_COUNT	8

+ 20 - 1
drivers/md/dm-linear.c

@@ -141,9 +141,27 @@ static int linear_iterate_devices(struct dm_target *ti,
 	return fn(ti, lc->dev, lc->start, ti->len, data);
 	return fn(ti, lc->dev, lc->start, ti->len, data);
 }
 }
 
 
+static long linear_direct_access(struct dm_target *ti, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	struct linear_c *lc = ti->private;
+	struct block_device *bdev = lc->dev->bdev;
+	struct blk_dax_ctl dax = {
+		.sector = linear_map_sector(ti, sector),
+		.size = size,
+	};
+	long ret;
+
+	ret = bdev_direct_access(bdev, &dax);
+	*kaddr = dax.addr;
+	*pfn = dax.pfn;
+
+	return ret;
+}
+
 static struct target_type linear_target = {
 static struct target_type linear_target = {
 	.name   = "linear",
 	.name   = "linear",
-	.version = {1, 2, 1},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.module = THIS_MODULE,
 	.ctr    = linear_ctr,
 	.ctr    = linear_ctr,
 	.dtr    = linear_dtr,
 	.dtr    = linear_dtr,
@@ -151,6 +169,7 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.status = linear_status,
 	.prepare_ioctl = linear_prepare_ioctl,
 	.prepare_ioctl = linear_prepare_ioctl,
 	.iterate_devices = linear_iterate_devices,
 	.iterate_devices = linear_iterate_devices,
+	.direct_access = linear_direct_access,
 };
 };
 
 
 int __init dm_linear_init(void)
 int __init dm_linear_init(void)

+ 318 - 36
drivers/md/dm-mpath.c

@@ -7,7 +7,8 @@
 
 
 #include <linux/device-mapper.h>
 #include <linux/device-mapper.h>
 
 
-#include "dm.h"
+#include "dm-rq.h"
+#include "dm-bio-record.h"
 #include "dm-path-selector.h"
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 #include "dm-uevent.h"
 
 
@@ -89,6 +90,8 @@ struct multipath {
 	atomic_t pg_init_in_progress;	/* Only one pg_init allowed at once */
 	atomic_t pg_init_in_progress;	/* Only one pg_init allowed at once */
 	atomic_t pg_init_count;		/* Number of times pg_init called */
 	atomic_t pg_init_count;		/* Number of times pg_init called */
 
 
+	unsigned queue_mode;
+
 	/*
 	/*
 	 * We must use a mempool of dm_mpath_io structs so that we
 	 * We must use a mempool of dm_mpath_io structs so that we
 	 * can resubmit bios on error.
 	 * can resubmit bios on error.
@@ -97,10 +100,13 @@ struct multipath {
 
 
 	struct mutex work_mutex;
 	struct mutex work_mutex;
 	struct work_struct trigger_event;
 	struct work_struct trigger_event;
+
+	struct work_struct process_queued_bios;
+	struct bio_list queued_bios;
 };
 };
 
 
 /*
 /*
- * Context information attached to each bio we process.
+ * Context information attached to each io we process.
  */
  */
 struct dm_mpath_io {
 struct dm_mpath_io {
 	struct pgpath *pgpath;
 	struct pgpath *pgpath;
@@ -114,6 +120,7 @@ static struct kmem_cache *_mpio_cache;
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 static void trigger_event(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
 static void activate_path(struct work_struct *work);
+static void process_queued_bios(struct work_struct *work);
 
 
 /*-----------------------------------------------
 /*-----------------------------------------------
  * Multipath state flags.
  * Multipath state flags.
@@ -185,7 +192,7 @@ static void free_priority_group(struct priority_group *pg,
 	kfree(pg);
 	kfree(pg);
 }
 }
 
 
-static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
+static struct multipath *alloc_multipath(struct dm_target *ti)
 {
 {
 	struct multipath *m;
 	struct multipath *m;
 
 
@@ -203,15 +210,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
 		mutex_init(&m->work_mutex);
 		mutex_init(&m->work_mutex);
 
 
 		m->mpio_pool = NULL;
 		m->mpio_pool = NULL;
-		if (!use_blk_mq) {
-			unsigned min_ios = dm_get_reserved_rq_based_ios();
-
-			m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
-			if (!m->mpio_pool) {
-				kfree(m);
-				return NULL;
-			}
-		}
+		m->queue_mode = DM_TYPE_NONE;
 
 
 		m->ti = ti;
 		m->ti = ti;
 		ti->private = m;
 		ti->private = m;
@@ -220,6 +219,39 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
 	return m;
 	return m;
 }
 }
 
 
+static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
+{
+	if (m->queue_mode == DM_TYPE_NONE) {
+		/*
+		 * Default to request-based.
+		 */
+		if (dm_use_blk_mq(dm_table_get_md(ti->table)))
+			m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
+		else
+			m->queue_mode = DM_TYPE_REQUEST_BASED;
+	}
+
+	if (m->queue_mode == DM_TYPE_REQUEST_BASED) {
+		unsigned min_ios = dm_get_reserved_rq_based_ios();
+
+		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
+		if (!m->mpio_pool)
+			return -ENOMEM;
+	}
+	else if (m->queue_mode == DM_TYPE_BIO_BASED) {
+		INIT_WORK(&m->process_queued_bios, process_queued_bios);
+		/*
+		 * bio-based doesn't support any direct scsi_dh management;
+		 * it just discovers if a scsi_dh is attached.
+		 */
+		set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
+	}
+
+	dm_table_set_type(ti->table, m->queue_mode);
+
+	return 0;
+}
+
 static void free_multipath(struct multipath *m)
 static void free_multipath(struct multipath *m)
 {
 {
 	struct priority_group *pg, *tmp;
 	struct priority_group *pg, *tmp;
@@ -272,6 +304,41 @@ static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
 	}
 	}
 }
 }
 
 
+static size_t multipath_per_bio_data_size(void)
+{
+	return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
+}
+
+static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
+{
+	return dm_per_bio_data(bio, multipath_per_bio_data_size());
+}
+
+static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio)
+{
+	/* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
+	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
+	void *bio_details = mpio + 1;
+
+	return bio_details;
+}
+
+static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p,
+					struct dm_bio_details **bio_details_p)
+{
+	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
+	struct dm_bio_details *bio_details = get_bio_details_from_bio(bio);
+
+	memset(mpio, 0, sizeof(*mpio));
+	memset(bio_details, 0, sizeof(*bio_details));
+	dm_bio_record(bio_details, bio);
+
+	if (mpio_p)
+		*mpio_p = mpio;
+	if (bio_details_p)
+		*bio_details_p = bio_details;
+}
+
 /*-----------------------------------------------
 /*-----------------------------------------------
  * Path selection
  * Path selection
  *-----------------------------------------------*/
  *-----------------------------------------------*/
@@ -431,16 +498,26 @@ failed:
  * and multipath_resume() calls and we have no need to check
  * and multipath_resume() calls and we have no need to check
  * for the DMF_NOFLUSH_SUSPENDING flag.
  * for the DMF_NOFLUSH_SUSPENDING flag.
  */
  */
-static int must_push_back(struct multipath *m)
+static bool __must_push_back(struct multipath *m)
+{
+	return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
+		 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
+		dm_noflush_suspending(m->ti));
+}
+
+static bool must_push_back_rq(struct multipath *m)
 {
 {
 	return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
 	return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
-		((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
-		  test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
-		 dm_noflush_suspending(m->ti)));
+		__must_push_back(m));
+}
+
+static bool must_push_back_bio(struct multipath *m)
+{
+	return __must_push_back(m);
 }
 }
 
 
 /*
 /*
- * Map cloned requests
+ * Map cloned requests (request-based multipath)
  */
  */
 static int __multipath_map(struct dm_target *ti, struct request *clone,
 static int __multipath_map(struct dm_target *ti, struct request *clone,
 			   union map_info *map_context,
 			   union map_info *map_context,
@@ -459,7 +536,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 		pgpath = choose_pgpath(m, nr_bytes);
 		pgpath = choose_pgpath(m, nr_bytes);
 
 
 	if (!pgpath) {
 	if (!pgpath) {
-		if (!must_push_back(m))
+		if (!must_push_back_rq(m))
 			r = -EIO;	/* Failed */
 			r = -EIO;	/* Failed */
 		return r;
 		return r;
 	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
 	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
@@ -529,6 +606,108 @@ static void multipath_release_clone(struct request *clone)
 	blk_mq_free_request(clone);
 	blk_mq_free_request(clone);
 }
 }
 
 
+/*
+ * Map cloned bios (bio-based multipath)
+ */
+static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio)
+{
+	size_t nr_bytes = bio->bi_iter.bi_size;
+	struct pgpath *pgpath;
+	unsigned long flags;
+	bool queue_io;
+
+	/* Do we need to select a new pgpath? */
+	pgpath = lockless_dereference(m->current_pgpath);
+	queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
+	if (!pgpath || !queue_io)
+		pgpath = choose_pgpath(m, nr_bytes);
+
+	if ((pgpath && queue_io) ||
+	    (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
+		/* Queue for the daemon to resubmit */
+		spin_lock_irqsave(&m->lock, flags);
+		bio_list_add(&m->queued_bios, bio);
+		spin_unlock_irqrestore(&m->lock, flags);
+		/* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
+		if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
+			pg_init_all_paths(m);
+		else if (!queue_io)
+			queue_work(kmultipathd, &m->process_queued_bios);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	if (!pgpath) {
+		if (!must_push_back_bio(m))
+			return -EIO;
+		return DM_MAPIO_REQUEUE;
+	}
+
+	mpio->pgpath = pgpath;
+	mpio->nr_bytes = nr_bytes;
+
+	bio->bi_error = 0;
+	bio->bi_bdev = pgpath->path.dev->bdev;
+	bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
+
+	if (pgpath->pg->ps.type->start_io)
+		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
+					      &pgpath->path,
+					      nr_bytes);
+	return DM_MAPIO_REMAPPED;
+}
+
+static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct multipath *m = ti->private;
+	struct dm_mpath_io *mpio = NULL;
+
+	multipath_init_per_bio_data(bio, &mpio, NULL);
+
+	return __multipath_map_bio(m, bio, mpio);
+}
+
+static void process_queued_bios_list(struct multipath *m)
+{
+	if (m->queue_mode == DM_TYPE_BIO_BASED)
+		queue_work(kmultipathd, &m->process_queued_bios);
+}
+
+static void process_queued_bios(struct work_struct *work)
+{
+	int r;
+	unsigned long flags;
+	struct bio *bio;
+	struct bio_list bios;
+	struct blk_plug plug;
+	struct multipath *m =
+		container_of(work, struct multipath, process_queued_bios);
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&m->lock, flags);
+
+	if (bio_list_empty(&m->queued_bios)) {
+		spin_unlock_irqrestore(&m->lock, flags);
+		return;
+	}
+
+	bio_list_merge(&bios, &m->queued_bios);
+	bio_list_init(&m->queued_bios);
+
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	blk_start_plug(&plug);
+	while ((bio = bio_list_pop(&bios))) {
+		r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio));
+		if (r < 0 || r == DM_MAPIO_REQUEUE) {
+			bio->bi_error = r;
+			bio_endio(bio);
+		} else if (r == DM_MAPIO_REMAPPED)
+			generic_make_request(bio);
+	}
+	blk_finish_plug(&plug);
+}
+
 /*
 /*
  * If we run out of usable paths, should we queue I/O or error it?
  * If we run out of usable paths, should we queue I/O or error it?
  */
  */
@@ -557,8 +736,10 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
 
 
 	spin_unlock_irqrestore(&m->lock, flags);
 	spin_unlock_irqrestore(&m->lock, flags);
 
 
-	if (!queue_if_no_path)
+	if (!queue_if_no_path) {
 		dm_table_run_md_queue_async(m->ti->table);
 		dm_table_run_md_queue_async(m->ti->table);
+		process_queued_bios_list(m);
+	}
 
 
 	return 0;
 	return 0;
 }
 }
@@ -798,6 +979,12 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
 	if (!hw_argc)
 	if (!hw_argc)
 		return 0;
 		return 0;
 
 
+	if (m->queue_mode == DM_TYPE_BIO_BASED) {
+		dm_consume_args(as, hw_argc);
+		DMERR("bio-based multipath doesn't allow hardware handler args");
+		return 0;
+	}
+
 	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
 	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
 
 
 	if (hw_argc > 1) {
 	if (hw_argc > 1) {
@@ -833,7 +1020,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 	const char *arg_name;
 	const char *arg_name;
 
 
 	static struct dm_arg _args[] = {
 	static struct dm_arg _args[] = {
-		{0, 6, "invalid number of feature args"},
+		{0, 8, "invalid number of feature args"},
 		{1, 50, "pg_init_retries must be between 1 and 50"},
 		{1, 50, "pg_init_retries must be between 1 and 50"},
 		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
 		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
 	};
 	};
@@ -873,6 +1060,24 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 			continue;
 			continue;
 		}
 		}
 
 
+		if (!strcasecmp(arg_name, "queue_mode") &&
+		    (argc >= 1)) {
+			const char *queue_mode_name = dm_shift_arg(as);
+
+			if (!strcasecmp(queue_mode_name, "bio"))
+				m->queue_mode = DM_TYPE_BIO_BASED;
+			else if (!strcasecmp(queue_mode_name, "rq"))
+				m->queue_mode = DM_TYPE_REQUEST_BASED;
+			else if (!strcasecmp(queue_mode_name, "mq"))
+				m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
+			else {
+				ti->error = "Unknown 'queue_mode' requested";
+				r = -EINVAL;
+			}
+			argc--;
+			continue;
+		}
+
 		ti->error = "Unrecognised multipath feature request";
 		ti->error = "Unrecognised multipath feature request";
 		r = -EINVAL;
 		r = -EINVAL;
 	} while (argc && !r);
 	} while (argc && !r);
@@ -880,8 +1085,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 	return r;
 	return r;
 }
 }
 
 
-static int multipath_ctr(struct dm_target *ti, unsigned int argc,
-			 char **argv)
+static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
 {
 	/* target arguments */
 	/* target arguments */
 	static struct dm_arg _args[] = {
 	static struct dm_arg _args[] = {
@@ -894,12 +1098,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	struct dm_arg_set as;
 	struct dm_arg_set as;
 	unsigned pg_count = 0;
 	unsigned pg_count = 0;
 	unsigned next_pg_num;
 	unsigned next_pg_num;
-	bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table));
 
 
 	as.argc = argc;
 	as.argc = argc;
 	as.argv = argv;
 	as.argv = argv;
 
 
-	m = alloc_multipath(ti, use_blk_mq);
+	m = alloc_multipath(ti);
 	if (!m) {
 	if (!m) {
 		ti->error = "can't allocate multipath";
 		ti->error = "can't allocate multipath";
 		return -EINVAL;
 		return -EINVAL;
@@ -909,6 +1112,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	if (r)
 	if (r)
 		goto bad;
 		goto bad;
 
 
+	r = alloc_multipath_stage2(ti, m);
+	if (r)
+		goto bad;
+
 	r = parse_hw_handler(&as, m);
 	r = parse_hw_handler(&as, m);
 	if (r)
 	if (r)
 		goto bad;
 		goto bad;
@@ -958,7 +1165,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	ti->num_flush_bios = 1;
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
 	ti->num_write_same_bios = 1;
-	if (use_blk_mq)
+	if (m->queue_mode == DM_TYPE_BIO_BASED)
+		ti->per_io_data_size = multipath_per_bio_data_size();
+	else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
 		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 
 
 	return 0;
 	return 0;
@@ -1083,8 +1292,10 @@ static int reinstate_path(struct pgpath *pgpath)
 
 
 out:
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
 	spin_unlock_irqrestore(&m->lock, flags);
-	if (run_queue)
+	if (run_queue) {
 		dm_table_run_md_queue_async(m->ti->table);
 		dm_table_run_md_queue_async(m->ti->table);
+		process_queued_bios_list(m);
+	}
 
 
 	return r;
 	return r;
 }
 }
@@ -1281,6 +1492,8 @@ static void pg_init_done(void *data, int errors)
 	}
 	}
 	clear_bit(MPATHF_QUEUE_IO, &m->flags);
 	clear_bit(MPATHF_QUEUE_IO, &m->flags);
 
 
+	process_queued_bios_list(m);
+
 	/*
 	/*
 	 * Wake up any thread waiting to suspend.
 	 * Wake up any thread waiting to suspend.
 	 */
 	 */
@@ -1328,7 +1541,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	 * during end I/O handling, since those clone requests don't have
 	 * during end I/O handling, since those clone requests don't have
 	 * bio clones.  If we queue them inside the multipath target,
 	 * bio clones.  If we queue them inside the multipath target,
 	 * we need to make bio clones, that requires memory allocation.
 	 * we need to make bio clones, that requires memory allocation.
-	 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+	 * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
 	 *  don't have bio clones.)
 	 *  don't have bio clones.)
 	 * Instead of queueing the clone request here, we queue the original
 	 * Instead of queueing the clone request here, we queue the original
 	 * request into dm core, which will remake a clone request and
 	 * request into dm core, which will remake a clone request and
@@ -1347,7 +1560,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
 
 
 	if (!atomic_read(&m->nr_valid_paths)) {
 	if (!atomic_read(&m->nr_valid_paths)) {
 		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
 		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-			if (!must_push_back(m))
+			if (!must_push_back_rq(m))
 				r = -EIO;
 				r = -EIO;
 		} else {
 		} else {
 			if (error == -EBADE)
 			if (error == -EBADE)
@@ -1381,6 +1594,64 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 	return r;
 	return r;
 }
 }
 
 
+static int do_end_io_bio(struct multipath *m, struct bio *clone,
+			 int error, struct dm_mpath_io *mpio)
+{
+	unsigned long flags;
+
+	if (!error)
+		return 0;	/* I/O complete */
+
+	if (noretry_error(error))
+		return error;
+
+	if (mpio->pgpath)
+		fail_path(mpio->pgpath);
+
+	if (!atomic_read(&m->nr_valid_paths)) {
+		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+			if (!must_push_back_bio(m))
+				return -EIO;
+			return DM_ENDIO_REQUEUE;
+		} else {
+			if (error == -EBADE)
+				return error;
+		}
+	}
+
+	/* Queue for the daemon to resubmit */
+	dm_bio_restore(get_bio_details_from_bio(clone), clone);
+
+	spin_lock_irqsave(&m->lock, flags);
+	bio_list_add(&m->queued_bios, clone);
+	spin_unlock_irqrestore(&m->lock, flags);
+	if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
+		queue_work(kmultipathd, &m->process_queued_bios);
+
+	return DM_ENDIO_INCOMPLETE;
+}
+
+static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error)
+{
+	struct multipath *m = ti->private;
+	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
+	struct pgpath *pgpath;
+	struct path_selector *ps;
+	int r;
+
+	BUG_ON(!mpio);
+
+	r = do_end_io_bio(m, clone, error, mpio);
+	pgpath = mpio->pgpath;
+	if (pgpath) {
+		ps = &pgpath->pg->ps;
+		if (ps->type->end_io)
+			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
+	}
+
+	return r;
+}
+
 /*
 /*
  * Suspend can't complete until all the I/O is processed so if
  * Suspend can't complete until all the I/O is processed so if
  * the last path fails we must error any remaining I/O.
  * the last path fails we must error any remaining I/O.
@@ -1454,7 +1725,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
 		DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
 		DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
 			      (m->pg_init_retries > 0) * 2 +
 			      (m->pg_init_retries > 0) * 2 +
 			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
 			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
-			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags));
+			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
+			      (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
+
 		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
 		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
 			DMEMIT("queue_if_no_path ");
 			DMEMIT("queue_if_no_path ");
 		if (m->pg_init_retries)
 		if (m->pg_init_retries)
@@ -1463,6 +1736,16 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
 			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
 			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
 		if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
 		if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
 			DMEMIT("retain_attached_hw_handler ");
 			DMEMIT("retain_attached_hw_handler ");
+		if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
+			switch(m->queue_mode) {
+			case DM_TYPE_BIO_BASED:
+				DMEMIT("queue_mode bio ");
+				break;
+			case DM_TYPE_MQ_REQUEST_BASED:
+				DMEMIT("queue_mode mq ");
+				break;
+			}
+		}
 	}
 	}
 
 
 	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
 	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1642,6 +1925,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
 		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
 			pg_init_all_paths(m);
 			pg_init_all_paths(m);
 		dm_table_run_md_queue_async(m->ti->table);
 		dm_table_run_md_queue_async(m->ti->table);
+		process_queued_bios_list(m);
 	}
 	}
 
 
 	/*
 	/*
@@ -1748,7 +2032,7 @@ static int multipath_busy(struct dm_target *ti)
  *---------------------------------------------------------------*/
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 static struct target_type multipath_target = {
 	.name = "multipath",
 	.name = "multipath",
-	.version = {1, 11, 0},
+	.version = {1, 12, 0},
 	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
 	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
 	.module = THIS_MODULE,
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.ctr = multipath_ctr,
@@ -1757,6 +2041,8 @@ static struct target_type multipath_target = {
 	.clone_and_map_rq = multipath_clone_and_map,
 	.clone_and_map_rq = multipath_clone_and_map,
 	.release_clone_rq = multipath_release_clone,
 	.release_clone_rq = multipath_release_clone,
 	.rq_end_io = multipath_end_io,
 	.rq_end_io = multipath_end_io,
+	.map = multipath_map_bio,
+	.end_io = multipath_end_io_bio,
 	.presuspend = multipath_presuspend,
 	.presuspend = multipath_presuspend,
 	.postsuspend = multipath_postsuspend,
 	.postsuspend = multipath_postsuspend,
 	.resume = multipath_resume,
 	.resume = multipath_resume,
@@ -1771,14 +2057,14 @@ static int __init dm_multipath_init(void)
 {
 {
 	int r;
 	int r;
 
 
-	/* allocate a slab for the dm_ios */
+	/* allocate a slab for the dm_mpath_ios */
 	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
 	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
 	if (!_mpio_cache)
 	if (!_mpio_cache)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
 	r = dm_register_target(&multipath_target);
 	r = dm_register_target(&multipath_target);
 	if (r < 0) {
 	if (r < 0) {
-		DMERR("register failed %d", r);
+		DMERR("request-based register failed %d", r);
 		r = -EINVAL;
 		r = -EINVAL;
 		goto bad_register_target;
 		goto bad_register_target;
 	}
 	}
@@ -1804,10 +2090,6 @@ static int __init dm_multipath_init(void)
 		goto bad_alloc_kmpath_handlerd;
 		goto bad_alloc_kmpath_handlerd;
 	}
 	}
 
 
-	DMINFO("version %u.%u.%u loaded",
-	       multipath_target.version[0], multipath_target.version[1],
-	       multipath_target.version[2]);
-
 	return 0;
 	return 0;
 
 
 bad_alloc_kmpath_handlerd:
 bad_alloc_kmpath_handlerd:

+ 2493 - 580
drivers/md/dm-raid.c

@@ -1,6 +1,6 @@
 /*
 /*
  * Copyright (C) 2010-2011 Neil Brown
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
  *
  *
  * This file is released under the GPL.
  * This file is released under the GPL.
  */
  */
@@ -17,7 +17,12 @@
 #include <linux/device-mapper.h>
 #include <linux/device-mapper.h>
 
 
 #define DM_MSG_PREFIX "raid"
 #define DM_MSG_PREFIX "raid"
-#define	MAX_RAID_DEVICES	253 /* raid4/5/6 limit */
+#define	MAX_RAID_DEVICES	253 /* md-raid kernel limit */
+
+/*
+ * Minimum sectors of free reshape space per raid device
+ */
+#define	MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
 
 
 static bool devices_handle_discard_safely = false;
 static bool devices_handle_discard_safely = false;
 
 
@@ -25,12 +30,12 @@ static bool devices_handle_discard_safely = false;
  * The following flags are used by dm-raid.c to set up the array state.
  * The following flags are used by dm-raid.c to set up the array state.
  * They must be cleared before md_run is called.
  * They must be cleared before md_run is called.
  */
  */
-#define FirstUse 10             /* rdev flag */
+#define FirstUse 10		/* rdev flag */
 
 
 struct raid_dev {
 struct raid_dev {
 	/*
 	/*
 	 * Two DM devices, one to hold metadata and one to hold the
 	 * Two DM devices, one to hold metadata and one to hold the
-	 * actual data/parity.  The reason for this is to not confuse
+	 * actual data/parity.	The reason for this is to not confuse
 	 * ti->len and give more flexibility in altering size and
 	 * ti->len and give more flexibility in altering size and
 	 * characteristics.
 	 * characteristics.
 	 *
 	 *
@@ -45,26 +50,176 @@ struct raid_dev {
 	struct md_rdev rdev;
 	struct md_rdev rdev;
 };
 };
 
 
+/*
+ * Bits for establishing rs->ctr_flags
+ *
+ * 1 = no flag value
+ * 2 = flag with value
+ */
+#define __CTR_FLAG_SYNC			0  /* 1 */ /* Not with raid0! */
+#define __CTR_FLAG_NOSYNC		1  /* 1 */ /* Not with raid0! */
+#define __CTR_FLAG_REBUILD		2  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_DAEMON_SLEEP		3  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_MIN_RECOVERY_RATE	4  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_MAX_RECOVERY_RATE	5  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_MAX_WRITE_BEHIND	6  /* 2 */ /* Only with raid1! */
+#define __CTR_FLAG_WRITE_MOSTLY		7  /* 2 */ /* Only with raid1! */
+#define __CTR_FLAG_STRIPE_CACHE		8  /* 2 */ /* Only with raid4/5/6! */
+#define __CTR_FLAG_REGION_SIZE		9  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_RAID10_COPIES	10 /* 2 */ /* Only with raid10 */
+#define __CTR_FLAG_RAID10_FORMAT	11 /* 2 */ /* Only with raid10 */
+/* New for v1.9.0 */
+#define __CTR_FLAG_DELTA_DISKS		12 /* 2 */ /* Only with reshapable raid1/4/5/6/10! */
+#define __CTR_FLAG_DATA_OFFSET		13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
+#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
+
 /*
 /*
  * Flags for rs->ctr_flags field.
  * Flags for rs->ctr_flags field.
  */
  */
-#define CTR_FLAG_SYNC              0x1
-#define CTR_FLAG_NOSYNC            0x2
-#define CTR_FLAG_REBUILD           0x4
-#define CTR_FLAG_DAEMON_SLEEP      0x8
-#define CTR_FLAG_MIN_RECOVERY_RATE 0x10
-#define CTR_FLAG_MAX_RECOVERY_RATE 0x20
-#define CTR_FLAG_MAX_WRITE_BEHIND  0x40
-#define CTR_FLAG_STRIPE_CACHE      0x80
-#define CTR_FLAG_REGION_SIZE       0x100
-#define CTR_FLAG_RAID10_COPIES     0x200
-#define CTR_FLAG_RAID10_FORMAT     0x400
+#define CTR_FLAG_SYNC			(1 << __CTR_FLAG_SYNC)
+#define CTR_FLAG_NOSYNC			(1 << __CTR_FLAG_NOSYNC)
+#define CTR_FLAG_REBUILD		(1 << __CTR_FLAG_REBUILD)
+#define CTR_FLAG_DAEMON_SLEEP		(1 << __CTR_FLAG_DAEMON_SLEEP)
+#define CTR_FLAG_MIN_RECOVERY_RATE	(1 << __CTR_FLAG_MIN_RECOVERY_RATE)
+#define CTR_FLAG_MAX_RECOVERY_RATE	(1 << __CTR_FLAG_MAX_RECOVERY_RATE)
+#define CTR_FLAG_MAX_WRITE_BEHIND	(1 << __CTR_FLAG_MAX_WRITE_BEHIND)
+#define CTR_FLAG_WRITE_MOSTLY		(1 << __CTR_FLAG_WRITE_MOSTLY)
+#define CTR_FLAG_STRIPE_CACHE		(1 << __CTR_FLAG_STRIPE_CACHE)
+#define CTR_FLAG_REGION_SIZE		(1 << __CTR_FLAG_REGION_SIZE)
+#define CTR_FLAG_RAID10_COPIES		(1 << __CTR_FLAG_RAID10_COPIES)
+#define CTR_FLAG_RAID10_FORMAT		(1 << __CTR_FLAG_RAID10_FORMAT)
+#define CTR_FLAG_DELTA_DISKS		(1 << __CTR_FLAG_DELTA_DISKS)
+#define CTR_FLAG_DATA_OFFSET		(1 << __CTR_FLAG_DATA_OFFSET)
+#define CTR_FLAG_RAID10_USE_NEAR_SETS	(1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
+
+/*
+ * Definitions of various constructor flags to
+ * be used in checks of valid / invalid flags
+ * per raid level.
+ */
+/* Define all sync flags */
+#define	CTR_FLAGS_ANY_SYNC		(CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)
+
+/* Define flags for options without argument (e.g. 'nosync') */
+#define	CTR_FLAG_OPTIONS_NO_ARGS	(CTR_FLAGS_ANY_SYNC | \
+					 CTR_FLAG_RAID10_USE_NEAR_SETS)
+
+/* Define flags for options with one argument (e.g. 'delta_disks +2') */
+#define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \
+				  CTR_FLAG_WRITE_MOSTLY | \
+				  CTR_FLAG_DAEMON_SLEEP | \
+				  CTR_FLAG_MIN_RECOVERY_RATE | \
+				  CTR_FLAG_MAX_RECOVERY_RATE | \
+				  CTR_FLAG_MAX_WRITE_BEHIND | \
+				  CTR_FLAG_STRIPE_CACHE | \
+				  CTR_FLAG_REGION_SIZE | \
+				  CTR_FLAG_RAID10_COPIES | \
+				  CTR_FLAG_RAID10_FORMAT | \
+				  CTR_FLAG_DELTA_DISKS | \
+				  CTR_FLAG_DATA_OFFSET)
+
+/* Valid options definitions per raid level... */
+
+/* "raid0" only accepts data offset */
+#define RAID0_VALID_FLAGS	(CTR_FLAG_DATA_OFFSET)
+
+/* "raid1" does not accept stripe cache or any raid10 options */
+#define RAID1_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_WRITE_MOSTLY | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET)
+
+/* "raid10" does not accept any raid1 or stripe cache options */
+#define RAID10_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_RAID10_COPIES | \
+				 CTR_FLAG_RAID10_FORMAT | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET | \
+				 CTR_FLAG_RAID10_USE_NEAR_SETS)
+
+/*
+ * "raid4/5/6" do not accept any raid1 or raid10 specific options
+ *
+ * "raid6" does not accept "nosync", because it is not guaranteed
+ * that both parity and q-syndrome are being written properly with
+ * any writes
+ */
+#define RAID45_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_STRIPE_CACHE | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET)
+
+#define RAID6_VALID_FLAGS	(CTR_FLAG_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_STRIPE_CACHE | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET)
+/* ...valid options definitions per raid level */
+
+/*
+ * Flags for rs->runtime_flags field
+ * (RT_FLAG prefix meaning "runtime flag")
+ *
+ * These are all internal and used to define runtime state,
+ * e.g. to prevent another resume from preresume processing
+ * the raid set all over again.
+ */
+#define RT_FLAG_RS_PRERESUMED		0
+#define RT_FLAG_RS_RESUMED		1
+#define RT_FLAG_RS_BITMAP_LOADED	2
+#define RT_FLAG_UPDATE_SBS		3
+#define RT_FLAG_RESHAPE_RS		4
+#define RT_FLAG_KEEP_RS_FROZEN		5
+
+/* Array elements of 64 bit needed for rebuild/failed disk bits */
+#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
+
+/*
+ * raid set level, layout and chunk sectors backup/restore
+ */
+struct rs_layout {
+	int new_level;
+	int new_layout;
+	int new_chunk_sectors;
+};
 
 
 struct raid_set {
 struct raid_set {
 	struct dm_target *ti;
 	struct dm_target *ti;
 
 
 	uint32_t bitmap_loaded;
 	uint32_t bitmap_loaded;
-	uint32_t ctr_flags;
+	uint32_t stripe_cache_entries;
+	unsigned long ctr_flags;
+	unsigned long runtime_flags;
+
+	uint64_t rebuild_disks[DISKS_ARRAY_ELEMS];
+
+	int raid_disks;
+	int delta_disks;
+	int data_offset;
+	int raid10_copies;
+	int requested_bitmap_chunk_sectors;
 
 
 	struct mddev md;
 	struct mddev md;
 	struct raid_type *raid_type;
 	struct raid_type *raid_type;
@@ -73,82 +228,446 @@ struct raid_set {
 	struct raid_dev dev[0];
 	struct raid_dev dev[0];
 };
 };
 
 
+static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
+{
+	struct mddev *mddev = &rs->md;
+
+	l->new_level = mddev->new_level;
+	l->new_layout = mddev->new_layout;
+	l->new_chunk_sectors = mddev->new_chunk_sectors;
+}
+
+static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
+{
+	struct mddev *mddev = &rs->md;
+
+	mddev->new_level = l->new_level;
+	mddev->new_layout = l->new_layout;
+	mddev->new_chunk_sectors = l->new_chunk_sectors;
+}
+
+/* raid10 algorithms (i.e. formats) */
+#define	ALGORITHM_RAID10_DEFAULT	0
+#define	ALGORITHM_RAID10_NEAR		1
+#define	ALGORITHM_RAID10_OFFSET		2
+#define	ALGORITHM_RAID10_FAR		3
+
 /* Supported raid types and properties. */
 /* Supported raid types and properties. */
 static struct raid_type {
 static struct raid_type {
 	const char *name;		/* RAID algorithm. */
 	const char *name;		/* RAID algorithm. */
 	const char *descr;		/* Descriptor text for logging. */
 	const char *descr;		/* Descriptor text for logging. */
-	const unsigned parity_devs;	/* # of parity devices. */
-	const unsigned minimal_devs;	/* minimal # of devices in set. */
-	const unsigned level;		/* RAID level. */
-	const unsigned algorithm;	/* RAID algorithm. */
+	const unsigned int parity_devs;	/* # of parity devices. */
+	const unsigned int minimal_devs;/* minimal # of devices in set. */
+	const unsigned int level;	/* RAID level. */
+	const unsigned int algorithm;	/* RAID algorithm. */
 } raid_types[] = {
 } raid_types[] = {
-	{"raid0",    "RAID0 (striping)",                0, 2, 0, 0 /* NONE */},
-	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
-	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
-	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
-	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
-	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
-	{"raid5_ls", "RAID5 (left symmetric)",		1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
-	{"raid5_rs", "RAID5 (right symmetric)",		1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
-	{"raid6_zr", "RAID6 (zero restart)",		2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
-	{"raid6_nr", "RAID6 (N restart)",		2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
-	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+	{"raid0",	  "raid0 (striping)",			    0, 2, 0,  0 /* NONE */},
+	{"raid1",	  "raid1 (mirroring)",			    0, 2, 1,  0 /* NONE */},
+	{"raid10_far",	  "raid10 far (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_FAR},
+	{"raid10_offset", "raid10 offset (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_OFFSET},
+	{"raid10_near",	  "raid10 near (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_NEAR},
+	{"raid10",	  "raid10 (striped mirrors)",		    0, 2, 10, ALGORITHM_RAID10_DEFAULT},
+	{"raid4",	  "raid4 (dedicated last parity disk)",	    1, 2, 4,  ALGORITHM_PARITY_N}, /* raid4 layout = raid5_n */
+	{"raid5_n",	  "raid5 (dedicated last parity disk)",	    1, 2, 5,  ALGORITHM_PARITY_N},
+	{"raid5_ls",	  "raid5 (left symmetric)",		    1, 2, 5,  ALGORITHM_LEFT_SYMMETRIC},
+	{"raid5_rs",	  "raid5 (right symmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_SYMMETRIC},
+	{"raid5_la",	  "raid5 (left asymmetric)",		    1, 2, 5,  ALGORITHM_LEFT_ASYMMETRIC},
+	{"raid5_ra",	  "raid5 (right asymmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_ASYMMETRIC},
+	{"raid6_zr",	  "raid6 (zero restart)",		    2, 4, 6,  ALGORITHM_ROTATING_ZERO_RESTART},
+	{"raid6_nr",	  "raid6 (N restart)",			    2, 4, 6,  ALGORITHM_ROTATING_N_RESTART},
+	{"raid6_nc",	  "raid6 (N continue)",			    2, 4, 6,  ALGORITHM_ROTATING_N_CONTINUE},
+	{"raid6_n_6",	  "raid6 (dedicated parity/Q n/6)",	    2, 4, 6,  ALGORITHM_PARITY_N_6},
+	{"raid6_ls_6",	  "raid6 (left symmetric dedicated Q 6)",   2, 4, 6,  ALGORITHM_LEFT_SYMMETRIC_6},
+	{"raid6_rs_6",	  "raid6 (right symmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_RIGHT_SYMMETRIC_6},
+	{"raid6_la_6",	  "raid6 (left asymmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_LEFT_ASYMMETRIC_6},
+	{"raid6_ra_6",	  "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6,  ALGORITHM_RIGHT_ASYMMETRIC_6}
+};
+
+/* True, if @v is in inclusive range [@min, @max] */
+static bool __within_range(long v, long min, long max)
+{
+	return v >= min && v <= max;
+}
+
+/* All table line arguments are defined here */
+static struct arg_name_flag {
+	const unsigned long flag;
+	const char *name;
+} __arg_name_flags[] = {
+	{ CTR_FLAG_SYNC, "sync"},
+	{ CTR_FLAG_NOSYNC, "nosync"},
+	{ CTR_FLAG_REBUILD, "rebuild"},
+	{ CTR_FLAG_DAEMON_SLEEP, "daemon_sleep"},
+	{ CTR_FLAG_MIN_RECOVERY_RATE, "min_recovery_rate"},
+	{ CTR_FLAG_MAX_RECOVERY_RATE, "max_recovery_rate"},
+	{ CTR_FLAG_MAX_WRITE_BEHIND, "max_write_behind"},
+	{ CTR_FLAG_WRITE_MOSTLY, "write_mostly"},
+	{ CTR_FLAG_STRIPE_CACHE, "stripe_cache"},
+	{ CTR_FLAG_REGION_SIZE, "region_size"},
+	{ CTR_FLAG_RAID10_COPIES, "raid10_copies"},
+	{ CTR_FLAG_RAID10_FORMAT, "raid10_format"},
+	{ CTR_FLAG_DATA_OFFSET, "data_offset"},
+	{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
+	{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
 };
 };
 
 
-static char *raid10_md_layout_to_format(int layout)
+/* Return argument name string for given @flag */
+static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
+{
+	if (hweight32(flag) == 1) {
+		struct arg_name_flag *anf = __arg_name_flags + ARRAY_SIZE(__arg_name_flags);
+
+		while (anf-- > __arg_name_flags)
+			if (flag & anf->flag)
+				return anf->name;
+
+	} else
+		DMERR("%s called with more than one flag!", __func__);
+
+	return NULL;
+}
+
+/*
+ * Bool helpers to test for various raid levels of a raid set.
+ * It's the level as reported by the superblock rather than
+ * the requested raid_type passed to the constructor.
+ */
+/* Return true, if raid set in @rs is raid0 */
+static bool rs_is_raid0(struct raid_set *rs)
+{
+	return !rs->md.level;
+}
+
+/* Return true, if raid set in @rs is raid1 */
+static bool rs_is_raid1(struct raid_set *rs)
+{
+	return rs->md.level == 1;
+}
+
+/* Return true, if raid set in @rs is raid10 */
+static bool rs_is_raid10(struct raid_set *rs)
+{
+	return rs->md.level == 10;
+}
+
+/* Return true, if raid set in @rs is level 6 */
+static bool rs_is_raid6(struct raid_set *rs)
+{
+	return rs->md.level == 6;
+}
+
+/* Return true, if raid set in @rs is level 4, 5 or 6 */
+static bool rs_is_raid456(struct raid_set *rs)
+{
+	return __within_range(rs->md.level, 4, 6);
+}
+
+/* Return true, if raid set in @rs is reshapable */
+static bool __is_raid10_far(int layout);
+static bool rs_is_reshapable(struct raid_set *rs)
+{
+	return rs_is_raid456(rs) ||
+	       (rs_is_raid10(rs) && !__is_raid10_far(rs->md.new_layout));
+}
+
+/* Return true, if raid set in @rs is recovering */
+static bool rs_is_recovering(struct raid_set *rs)
+{
+	return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
+}
+
+/* Return true, if raid set in @rs is reshaping */
+static bool rs_is_reshaping(struct raid_set *rs)
+{
+	return rs->md.reshape_position != MaxSector;
+}
+
+/*
+ * bool helpers to test for various raid levels of a raid type @rt
+ */
+
+/* Return true, if raid type in @rt is raid0 */
+static bool rt_is_raid0(struct raid_type *rt)
+{
+	return !rt->level;
+}
+
+/* Return true, if raid type in @rt is raid1 */
+static bool rt_is_raid1(struct raid_type *rt)
+{
+	return rt->level == 1;
+}
+
+/* Return true, if raid type in @rt is raid10 */
+static bool rt_is_raid10(struct raid_type *rt)
+{
+	return rt->level == 10;
+}
+
+/* Return true, if raid type in @rt is raid4/5 */
+static bool rt_is_raid45(struct raid_type *rt)
+{
+	return __within_range(rt->level, 4, 5);
+}
+
+/* Return true, if raid type in @rt is raid6 */
+static bool rt_is_raid6(struct raid_type *rt)
+{
+	return rt->level == 6;
+}
+
+/* Return true, if raid type in @rt is raid4/5/6 */
+static bool rt_is_raid456(struct raid_type *rt)
+{
+	return __within_range(rt->level, 4, 6);
+}
+/* END: raid level bools */
+
+/* Return valid ctr flags for the raid level of @rs */
+static unsigned long __valid_flags(struct raid_set *rs)
+{
+	if (rt_is_raid0(rs->raid_type))
+		return RAID0_VALID_FLAGS;
+	else if (rt_is_raid1(rs->raid_type))
+		return RAID1_VALID_FLAGS;
+	else if (rt_is_raid10(rs->raid_type))
+		return RAID10_VALID_FLAGS;
+	else if (rt_is_raid45(rs->raid_type))
+		return RAID45_VALID_FLAGS;
+	else if (rt_is_raid6(rs->raid_type))
+		return RAID6_VALID_FLAGS;
+
+	return 0;
+}
+
+/*
+ * Check for valid flags set on @rs
+ *
+ * Has to be called after parsing of the ctr flags!
+ */
+static int rs_check_for_valid_flags(struct raid_set *rs)
+{
+	if (rs->ctr_flags & ~__valid_flags(rs)) {
+		rs->ti->error = "Invalid flags combination";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* MD raid10 bit definitions and helpers */
+#define RAID10_OFFSET			(1 << 16) /* stripes with data copies are adjacent on devices */
+#define RAID10_BROCKEN_USE_FAR_SETS	(1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */
+#define RAID10_USE_FAR_SETS		(1 << 18) /* Use sets instead of whole stripe rotation */
+#define RAID10_FAR_COPIES_SHIFT		8	  /* raid10 # far copies shift (2nd byte of layout) */
+
+/* Return md raid10 near copies for @layout */
+static unsigned int __raid10_near_copies(int layout)
+{
+	return layout & 0xFF;
+}
+
+/* Return md raid10 far copies for @layout */
+static unsigned int __raid10_far_copies(int layout)
+{
+	return __raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT);
+}
+
+/* Return true if md raid10 offset for @layout */
+static bool __is_raid10_offset(int layout)
+{
+	return !!(layout & RAID10_OFFSET);
+}
+
+/* Return true if md raid10 near for @layout */
+static bool __is_raid10_near(int layout)
+{
+	return !__is_raid10_offset(layout) && __raid10_near_copies(layout) > 1;
+}
+
+/* Return true if md raid10 far for @layout */
+static bool __is_raid10_far(int layout)
+{
+	return !__is_raid10_offset(layout) && __raid10_far_copies(layout) > 1;
+}
+
+/* Return md raid10 layout string for @layout */
+static const char *raid10_md_layout_to_format(int layout)
 {
 {
 	/*
 	/*
-	 * Bit 16 and 17 stand for "offset" and "use_far_sets"
+	 * Bit 16 stands for "offset"
+	 * (i.e. adjacent stripes hold copies)
+	 *
 	 * Refer to MD's raid10.c for details
 	 * Refer to MD's raid10.c for details
 	 */
 	 */
-	if ((layout & 0x10000) && (layout & 0x20000))
+	if (__is_raid10_offset(layout))
 		return "offset";
 		return "offset";
 
 
-	if ((layout & 0xFF) > 1)
+	if (__raid10_near_copies(layout) > 1)
 		return "near";
 		return "near";
 
 
+	WARN_ON(__raid10_far_copies(layout) < 2);
+
 	return "far";
 	return "far";
 }
 }
 
 
-static unsigned raid10_md_layout_to_copies(int layout)
+/* Return md raid10 algorithm for @name */
+static int raid10_name_to_format(const char *name)
 {
 {
-	if ((layout & 0xFF) > 1)
-		return layout & 0xFF;
-	return (layout >> 8) & 0xFF;
+	if (!strcasecmp(name, "near"))
+		return ALGORITHM_RAID10_NEAR;
+	else if (!strcasecmp(name, "offset"))
+		return ALGORITHM_RAID10_OFFSET;
+	else if (!strcasecmp(name, "far"))
+		return ALGORITHM_RAID10_FAR;
+
+	return -EINVAL;
+}
+
+/* Return md raid10 copies for @layout */
+static unsigned int raid10_md_layout_to_copies(int layout)
+{
+	return max(__raid10_near_copies(layout), __raid10_far_copies(layout));
 }
 }
 
 
-static int raid10_format_to_md_layout(char *format, unsigned copies)
+/* Return md raid10 layout for @algorithm and @copies (-EINVAL if @algorithm is unknown) */
+static int raid10_format_to_md_layout(struct raid_set *rs,
+				      unsigned int algorithm,
+				      unsigned int copies)
 {
 {
-	unsigned n = 1, f = 1;
+	unsigned int n = 1, f = 1, r = 0;
 
 
-	if (!strcasecmp("near", format))
+	/*
+	 * MD resilience flaw:
+	 *
+	 * enabling use_far_sets for far/offset formats causes copies
+	 * to be colocated on the same devs together with their origins!
+	 *
+	 * -> disable it for now in the definition above
+	 */
+	if (algorithm == ALGORITHM_RAID10_DEFAULT ||
+	    algorithm == ALGORITHM_RAID10_NEAR)
 		n = copies;
 		n = copies;
-	else
+
+	else if (algorithm == ALGORITHM_RAID10_OFFSET) {
 		f = copies;
 		f = copies;
+		r = RAID10_OFFSET;
+		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
+			r |= RAID10_USE_FAR_SETS;
 
 
-	if (!strcasecmp("offset", format))
-		return 0x30000 | (f << 8) | n;
+	} else if (algorithm == ALGORITHM_RAID10_FAR) {
+		f = copies;
+		r = 0;	/* far format: the RAID10_OFFSET bit stays clear */
+		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
+			r |= RAID10_USE_FAR_SETS;
 
 
-	if (!strcasecmp("far", format))
-		return 0x20000 | (f << 8) | n;
+	} else
+		return -EINVAL;
 
 
-	return (f << 8) | n;
+	return r | (f << RAID10_FAR_COPIES_SHIFT) | n;
 }
 }
+/* END: MD raid10 bit definitions and helpers */
 
 
-static struct raid_type *get_raid_type(char *name)
+/* Check for any of the raid10 algorithms */
+static bool __got_raid10(struct raid_type *rtp, const int layout)
 {
 {
-	int i;
+	if (rtp->level == 10) {
+		switch (rtp->algorithm) {
+		case ALGORITHM_RAID10_DEFAULT:
+		case ALGORITHM_RAID10_NEAR:
+			return __is_raid10_near(layout);
+		case ALGORITHM_RAID10_OFFSET:
+			return __is_raid10_offset(layout);
+		case ALGORITHM_RAID10_FAR:
+			return __is_raid10_far(layout);
+		default:
+			break;
+		}
+	}
+
+	return false;
+}
+
+/* Return raid_type for @name */
+static struct raid_type *get_raid_type(const char *name)
+{
+	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);
+
+	while (rtp-- > raid_types)
+		if (!strcasecmp(rtp->name, name))
+			return rtp;
+
+	return NULL;
+}
+
+/* Return raid_type derived from @level and @layout */
+static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
+{
+	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);
 
 
-	for (i = 0; i < ARRAY_SIZE(raid_types); i++)
-		if (!strcmp(raid_types[i].name, name))
-			return &raid_types[i];
+	while (rtp-- > raid_types) {
+		/* RAID10 special checks based on @layout flags/properties */
+		if (rtp->level == level &&
+		    (__got_raid10(rtp, layout) || rtp->algorithm == layout))
+			return rtp;
+	}
 
 
 	return NULL;
 	return NULL;
 }
 }
 
 
-static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
+/*
+ * Conditionally change bdev capacity of @rs
+ * in case of a disk add/remove reshape
+ */
+static void rs_set_capacity(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
+	struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
+
+	/*
+	 * raid10 sets rdev->sectors to the device size, which
+	 * is unintended in case of out-of-place reshaping
+	 */
+	rdev_for_each(rdev, mddev)
+		rdev->sectors = mddev->dev_sectors;
+
+	set_capacity(gendisk, mddev->array_sectors);
+	revalidate_disk(gendisk);
+}
+
+/*
+ * Set the mddev properties in @rs to the current
+ * ones retrieved from the freshest superblock
+ */
+static void rs_set_cur(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	mddev->new_level = mddev->level;
+	mddev->new_layout = mddev->layout;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
+}
+
+/*
+ * Set the mddev properties in @rs to the new
+ * ones requested by the ctr
+ */
+static void rs_set_new(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	mddev->level = mddev->new_level;
+	mddev->layout = mddev->new_layout;
+	mddev->chunk_sectors = mddev->new_chunk_sectors;
+	mddev->raid_disks = rs->raid_disks;
+	mddev->delta_disks = 0;
+}
+
+static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *raid_type,
+				       unsigned int raid_devs)
 {
 {
-	unsigned i;
+	unsigned int i;
 	struct raid_set *rs;
 	struct raid_set *rs;
 
 
 	if (raid_devs <= raid_type->parity_devs) {
 	if (raid_devs <= raid_type->parity_devs) {
@@ -164,15 +683,19 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 
 
 	mddev_init(&rs->md);
 	mddev_init(&rs->md);
 
 
+	rs->raid_disks = raid_devs;
+	rs->delta_disks = 0;
+
 	rs->ti = ti;
 	rs->ti = ti;
 	rs->raid_type = raid_type;
 	rs->raid_type = raid_type;
+	rs->stripe_cache_entries = 256;
 	rs->md.raid_disks = raid_devs;
 	rs->md.raid_disks = raid_devs;
 	rs->md.level = raid_type->level;
 	rs->md.level = raid_type->level;
 	rs->md.new_level = rs->md.level;
 	rs->md.new_level = rs->md.level;
 	rs->md.layout = raid_type->algorithm;
 	rs->md.layout = raid_type->algorithm;
 	rs->md.new_layout = rs->md.layout;
 	rs->md.new_layout = rs->md.layout;
 	rs->md.delta_disks = 0;
 	rs->md.delta_disks = 0;
-	rs->md.recovery_cp = 0;
+	rs->md.recovery_cp = MaxSector;
 
 
 	for (i = 0; i < raid_devs; i++)
 	for (i = 0; i < raid_devs; i++)
 		md_rdev_init(&rs->dev[i].rdev);
 		md_rdev_init(&rs->dev[i].rdev);
@@ -189,11 +712,11 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	return rs;
 	return rs;
 }
 }
 
 
-static void context_free(struct raid_set *rs)
+static void raid_set_free(struct raid_set *rs)
 {
 {
 	int i;
 	int i;
 
 
-	for (i = 0; i < rs->md.raid_disks; i++) {
+	for (i = 0; i < rs->raid_disks; i++) {
 		if (rs->dev[i].meta_dev)
 		if (rs->dev[i].meta_dev)
 			dm_put_device(rs->ti, rs->dev[i].meta_dev);
 			dm_put_device(rs->ti, rs->dev[i].meta_dev);
 		md_rdev_clear(&rs->dev[i].rdev);
 		md_rdev_clear(&rs->dev[i].rdev);
@@ -218,16 +741,22 @@ static void context_free(struct raid_set *rs)
  *    <meta_dev> -
  *    <meta_dev> -
  *
  *
  * This code parses those words.  If there is a failure,
  * This code parses those words.  If there is a failure,
- * the caller must use context_free to unwind the operations.
+ * the caller must use raid_set_free() to unwind the operations.
  */
  */
-static int dev_parms(struct raid_set *rs, char **argv)
+static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
 {
 {
 	int i;
 	int i;
 	int rebuild = 0;
 	int rebuild = 0;
 	int metadata_available = 0;
 	int metadata_available = 0;
-	int ret = 0;
+	int r = 0;
+	const char *arg;
 
 
-	for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
+	/* Shift off the number-of-raid-devices argument to get to the device pairs */
+	arg = dm_shift_arg(as);
+	if (!arg)
+		return -EINVAL;
+
+	for (i = 0; i < rs->raid_disks; i++) {
 		rs->dev[i].rdev.raid_disk = i;
 		rs->dev[i].rdev.raid_disk = i;
 
 
 		rs->dev[i].meta_dev = NULL;
 		rs->dev[i].meta_dev = NULL;
@@ -240,39 +769,49 @@ static int dev_parms(struct raid_set *rs, char **argv)
 		rs->dev[i].rdev.data_offset = 0;
 		rs->dev[i].rdev.data_offset = 0;
 		rs->dev[i].rdev.mddev = &rs->md;
 		rs->dev[i].rdev.mddev = &rs->md;
 
 
-		if (strcmp(argv[0], "-")) {
-			ret = dm_get_device(rs->ti, argv[0],
-					    dm_table_get_mode(rs->ti->table),
-					    &rs->dev[i].meta_dev);
-			rs->ti->error = "RAID metadata device lookup failure";
-			if (ret)
-				return ret;
+		arg = dm_shift_arg(as);
+		if (!arg)
+			return -EINVAL;
+
+		if (strcmp(arg, "-")) {
+			r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
+					  &rs->dev[i].meta_dev);
+			if (r) {
+				rs->ti->error = "RAID metadata device lookup failure";
+				return r;
+			}
 
 
 			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
 			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
-			if (!rs->dev[i].rdev.sb_page)
+			if (!rs->dev[i].rdev.sb_page) {
+				rs->ti->error = "Failed to allocate superblock page";
 				return -ENOMEM;
 				return -ENOMEM;
+			}
 		}
 		}
 
 
-		if (!strcmp(argv[1], "-")) {
+		arg = dm_shift_arg(as);
+		if (!arg)
+			return -EINVAL;
+
+		if (!strcmp(arg, "-")) {
 			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
 			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
 			    (!rs->dev[i].rdev.recovery_offset)) {
 			    (!rs->dev[i].rdev.recovery_offset)) {
 				rs->ti->error = "Drive designated for rebuild not specified";
 				rs->ti->error = "Drive designated for rebuild not specified";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
 
 
-			rs->ti->error = "No data device supplied with metadata device";
-			if (rs->dev[i].meta_dev)
+			if (rs->dev[i].meta_dev) {
+				rs->ti->error = "No data device supplied with metadata device";
 				return -EINVAL;
 				return -EINVAL;
+			}
 
 
 			continue;
 			continue;
 		}
 		}
 
 
-		ret = dm_get_device(rs->ti, argv[1],
-				    dm_table_get_mode(rs->ti->table),
-				    &rs->dev[i].data_dev);
-		if (ret) {
+		r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
+				  &rs->dev[i].data_dev);
+		if (r) {
 			rs->ti->error = "RAID device lookup failure";
 			rs->ti->error = "RAID device lookup failure";
-			return ret;
+			return r;
 		}
 		}
 
 
 		if (rs->dev[i].meta_dev) {
 		if (rs->dev[i].meta_dev) {
@@ -280,7 +819,7 @@ static int dev_parms(struct raid_set *rs, char **argv)
 			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
 			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
 		}
 		}
 		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
 		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
-		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
+		list_add_tail(&rs->dev[i].rdev.same_set, &rs->md.disks);
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
 			rebuild++;
 			rebuild++;
 	}
 	}
@@ -301,8 +840,7 @@ static int dev_parms(struct raid_set *rs, char **argv)
 		 *
 		 *
 		 * User could specify 'nosync' option if desperate.
 		 * User could specify 'nosync' option if desperate.
 		 */
 		 */
-		DMERR("Unable to rebuild drive while array is not in-sync");
-		rs->ti->error = "RAID device lookup failure";
+		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
@@ -325,7 +863,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 
 
 	if (!region_size) {
 	if (!region_size) {
 		/*
 		/*
-		 * Choose a reasonable default.  All figures in sectors.
+		 * Choose a reasonable default.	 All figures in sectors.
 		 */
 		 */
 		if (min_region_size > (1 << 13)) {
 		if (min_region_size > (1 << 13)) {
 			/* If not a power of 2, make it the next power of 2 */
 			/* If not a power of 2, make it the next power of 2 */
@@ -366,7 +904,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 	/*
 	/*
 	 * Convert sectors to bytes.
 	 * Convert sectors to bytes.
 	 */
 	 */
-	rs->md.bitmap_info.chunksize = (region_size << 9);
+	rs->md.bitmap_info.chunksize = to_bytes(region_size);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -382,9 +920,9 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
  */
  */
 static int validate_raid_redundancy(struct raid_set *rs)
 static int validate_raid_redundancy(struct raid_set *rs)
 {
 {
-	unsigned i, rebuild_cnt = 0;
-	unsigned rebuilds_per_group = 0, copies, d;
-	unsigned group_size, last_group_start;
+	unsigned int i, rebuild_cnt = 0;
+	unsigned int rebuilds_per_group = 0, copies;
+	unsigned int group_size, last_group_start;
 
 
 	for (i = 0; i < rs->md.raid_disks; i++)
 	for (i = 0; i < rs->md.raid_disks; i++)
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -403,7 +941,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 			goto too_many;
 			goto too_many;
 		break;
 		break;
 	case 10:
 	case 10:
-		copies = raid10_md_layout_to_copies(rs->md.layout);
+		copies = raid10_md_layout_to_copies(rs->md.new_layout);
 		if (rebuild_cnt < copies)
 		if (rebuild_cnt < copies)
 			break;
 			break;
 
 
@@ -417,17 +955,16 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 * simple case where the number of devices is a multiple of the
 		 * simple case where the number of devices is a multiple of the
 		 * number of copies, we must also handle cases where the number
 		 * number of copies, we must also handle cases where the number
 		 * of devices is not a multiple of the number of copies.
 		 * of devices is not a multiple of the number of copies.
-		 * E.g.    dev1 dev2 dev3 dev4 dev5
-		 *          A    A    B    B    C
-		 *          C    D    D    E    E
+		 * E.g.	   dev1 dev2 dev3 dev4 dev5
+		 *	    A	 A    B	   B	C
+		 *	    C	 D    D	   E	E
 		 */
 		 */
-		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
-			for (i = 0; i < rs->md.raid_disks * copies; i++) {
+		if (__is_raid10_near(rs->md.new_layout)) {
+			for (i = 0; i < rs->md.raid_disks; i++) {
 				if (!(i % copies))
 				if (!(i % copies))
 					rebuilds_per_group = 0;
 					rebuilds_per_group = 0;
-				d = i % rs->md.raid_disks;
-				if ((!rs->dev[d].rdev.sb_page ||
-				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+				if ((!rs->dev[i].rdev.sb_page ||
+				    !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
 				    (++rebuilds_per_group >= copies))
 				    (++rebuilds_per_group >= copies))
 					goto too_many;
 					goto too_many;
 			}
 			}
@@ -442,7 +979,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 * use the 'use_far_sets' variant.)
 		 * use the 'use_far_sets' variant.)
 		 *
 		 *
 		 * This check is somewhat complicated by the need to account
 		 * This check is somewhat complicated by the need to account
-		 * for arrays that are not a multiple of (far) copies.  This
+		 * for arrays that are not a multiple of (far) copies.	This
 		 * results in the need to treat the last (potentially larger)
 		 * results in the need to treat the last (potentially larger)
 		 * set differently.
 		 * set differently.
 		 */
 		 */
@@ -475,42 +1012,48 @@ too_many:
  *
  *
  * Argument definitions
  * Argument definitions
  *    <chunk_size>			The number of sectors per disk that
  *    <chunk_size>			The number of sectors per disk that
- *                                      will form the "stripe"
+ *					will form the "stripe"
  *    [[no]sync]			Force or prevent recovery of the
  *    [[no]sync]			Force or prevent recovery of the
- *                                      entire array
+ *					entire array
  *    [rebuild <idx>]			Rebuild the drive indicated by the index
  *    [rebuild <idx>]			Rebuild the drive indicated by the index
  *    [daemon_sleep <ms>]		Time between bitmap daemon work to
  *    [daemon_sleep <ms>]		Time between bitmap daemon work to
- *                                      clear bits
+ *					clear bits
  *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [write_mostly <idx>]		Indicate a write mostly drive via index
  *    [write_mostly <idx>]		Indicate a write mostly drive via index
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
- *    [region_size <sectors>]           Defines granularity of bitmap
+ *    [region_size <sectors>]		Defines granularity of bitmap
  *
  *
  * RAID10-only options:
  * RAID10-only options:
- *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
+ *    [raid10_copies <# copies>]	Number of copies.  (Default: 2)
  *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
  *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
  */
  */
-static int parse_raid_params(struct raid_set *rs, char **argv,
-			     unsigned num_raid_params)
-{
-	char *raid10_format = "near";
-	unsigned raid10_copies = 2;
-	unsigned i;
-	unsigned long value, region_size = 0;
-	sector_t sectors_per_dev = rs->ti->len;
+static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
+			     unsigned int num_raid_params)
+{
+	int value, raid10_format = ALGORITHM_RAID10_DEFAULT;
+	unsigned int raid10_copies = 2;
+	unsigned int i, write_mostly = 0;
+	unsigned int region_size = 0;
 	sector_t max_io_len;
 	sector_t max_io_len;
-	char *key;
+	const char *arg, *key;
+	struct raid_dev *rd;
+	struct raid_type *rt = rs->raid_type;
+
+	arg = dm_shift_arg(as);
+	num_raid_params--; /* Account for chunk_size argument */
+
+	if (kstrtoint(arg, 10, &value) < 0) {
+		rs->ti->error = "Bad numerical argument given for chunk_size";
+		return -EINVAL;
+	}
 
 
 	/*
 	/*
 	 * First, parse the in-order required arguments
 	 * First, parse the in-order required arguments
 	 * "chunk_size" is the only argument of this type.
 	 * "chunk_size" is the only argument of this type.
 	 */
 	 */
-	if ((kstrtoul(argv[0], 10, &value) < 0)) {
-		rs->ti->error = "Bad chunk size";
-		return -EINVAL;
-	} else if (rs->raid_type->level == 1) {
+	if (rt_is_raid1(rt)) {
 		if (value)
 		if (value)
 			DMERR("Ignoring chunk size parameter for RAID 1");
 			DMERR("Ignoring chunk size parameter for RAID 1");
 		value = 0;
 		value = 0;
@@ -523,8 +1066,6 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	}
 	}
 
 
 	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
 	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
-	argv++;
-	num_raid_params--;
 
 
 	/*
 	/*
 	 * We set each individual device as In_sync with a completed
 	 * We set each individual device as In_sync with a completed
@@ -532,18 +1073,18 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	 * replacement then one of the following cases applies:
 	 * replacement then one of the following cases applies:
 	 *
 	 *
 	 *   1) User specifies 'rebuild'.
 	 *   1) User specifies 'rebuild'.
-	 *      - Device is reset when param is read.
+	 *	- Device is reset when param is read.
 	 *   2) A new device is supplied.
 	 *   2) A new device is supplied.
-	 *      - No matching superblock found, resets device.
+	 *	- No matching superblock found, resets device.
 	 *   3) Device failure was transient and returns on reload.
 	 *   3) Device failure was transient and returns on reload.
-	 *      - Failure noticed, resets device for bitmap replay.
+	 *	- Failure noticed, resets device for bitmap replay.
 	 *   4) Device hadn't completed recovery after previous failure.
 	 *   4) Device hadn't completed recovery after previous failure.
-	 *      - Superblock is read and overrides recovery_offset.
+	 *	- Superblock is read and overrides recovery_offset.
 	 *
 	 *
 	 * What is found in the superblocks of the devices is always
 	 * What is found in the superblocks of the devices is always
 	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
 	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
 	 */
 	 */
-	for (i = 0; i < rs->md.raid_disks; i++) {
+	for (i = 0; i < rs->raid_disks; i++) {
 		set_bit(In_sync, &rs->dev[i].rdev.flags);
 		set_bit(In_sync, &rs->dev[i].rdev.flags);
 		rs->dev[i].rdev.recovery_offset = MaxSector;
 		rs->dev[i].rdev.recovery_offset = MaxSector;
 	}
 	}
@@ -552,72 +1093,112 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	 * Second, parse the unordered optional arguments
 	 * Second, parse the unordered optional arguments
 	 */
 	 */
 	for (i = 0; i < num_raid_params; i++) {
 	for (i = 0; i < num_raid_params; i++) {
-		if (!strcasecmp(argv[i], "nosync")) {
-			rs->md.recovery_cp = MaxSector;
-			rs->ctr_flags |= CTR_FLAG_NOSYNC;
+		key = dm_shift_arg(as);
+		if (!key) {
+			rs->ti->error = "Not enough raid parameters given";
+			return -EINVAL;
+		}
+
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC))) {
+			if (test_and_set_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
+				rs->ti->error = "Only one 'nosync' argument allowed";
+				return -EINVAL;
+			}
+			continue;
+		}
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) {
+			if (test_and_set_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) {
+				rs->ti->error = "Only one 'sync' argument allowed";
+				return -EINVAL;
+			}
 			continue;
 			continue;
 		}
 		}
-		if (!strcasecmp(argv[i], "sync")) {
-			rs->md.recovery_cp = 0;
-			rs->ctr_flags |= CTR_FLAG_SYNC;
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
+			if (test_and_set_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
+				rs->ti->error = "Only one 'raid10_use_new_sets' argument allowed";
+				return -EINVAL;
+			}
 			continue;
 			continue;
 		}
 		}
 
 
-		/* The rest of the optional arguments come in key/value pairs */
-		if ((i + 1) >= num_raid_params) {
+		arg = dm_shift_arg(as);
+		i++; /* Account for the argument pairs */
+		if (!arg) {
 			rs->ti->error = "Wrong number of raid parameters given";
 			rs->ti->error = "Wrong number of raid parameters given";
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
 
 
-		key = argv[i++];
+		/*
+		 * Parameters that take a string value are checked here.
+		 */
 
 
-		/* Parameters that take a string value are checked here. */
-		if (!strcasecmp(key, "raid10_format")) {
-			if (rs->raid_type->level != 10) {
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
+			if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
+				rs->ti->error = "Only one 'raid10_format' argument pair allowed";
+				return -EINVAL;
+			}
+			if (!rt_is_raid10(rt)) {
 				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
 				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
-			if (strcmp("near", argv[i]) &&
-			    strcmp("far", argv[i]) &&
-			    strcmp("offset", argv[i])) {
+			raid10_format = raid10_name_to_format(arg);
+			if (raid10_format < 0) {
 				rs->ti->error = "Invalid 'raid10_format' value given";
 				rs->ti->error = "Invalid 'raid10_format' value given";
-				return -EINVAL;
+				return raid10_format;
 			}
 			}
-			raid10_format = argv[i];
-			rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
 			continue;
 			continue;
 		}
 		}
 
 
-		if (kstrtoul(argv[i], 10, &value) < 0) {
+		if (kstrtoint(arg, 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
 
 
-		/* Parameters that take a numeric value are checked here */
-		if (!strcasecmp(key, "rebuild")) {
-			if (value >= rs->md.raid_disks) {
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD))) {
+			/*
+			 * "rebuild" is being passed in by userspace to provide
+			 * indexes of replaced devices and to set up additional
+			 * devices on raid level takeover.
+			 */
+			if (!__within_range(value, 0, rs->raid_disks - 1)) {
 				rs->ti->error = "Invalid rebuild index given";
 				rs->ti->error = "Invalid rebuild index given";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
-			clear_bit(In_sync, &rs->dev[value].rdev.flags);
-			rs->dev[value].rdev.recovery_offset = 0;
-			rs->ctr_flags |= CTR_FLAG_REBUILD;
-		} else if (!strcasecmp(key, "write_mostly")) {
-			if (rs->raid_type->level != 1) {
+
+			if (test_and_set_bit(value, (void *) rs->rebuild_disks)) {
+				rs->ti->error = "rebuild for this index already given";
+				return -EINVAL;
+			}
+
+			rd = rs->dev + value;
+			clear_bit(In_sync, &rd->rdev.flags);
+			clear_bit(Faulty, &rd->rdev.flags);
+			rd->rdev.recovery_offset = 0;
+			set_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags);
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
+			if (!rt_is_raid1(rt)) {
 				rs->ti->error = "write_mostly option is only valid for RAID1";
 				rs->ti->error = "write_mostly option is only valid for RAID1";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
-			if (value >= rs->md.raid_disks) {
-				rs->ti->error = "Invalid write_mostly drive index given";
+
+			if (!__within_range(value, 0, rs->md.raid_disks - 1)) {
+				rs->ti->error = "Invalid write_mostly index given";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
+
+			write_mostly++;
 			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
 			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
-		} else if (!strcasecmp(key, "max_write_behind")) {
-			if (rs->raid_type->level != 1) {
+			set_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
+			if (!rt_is_raid1(rt)) {
 				rs->ti->error = "max_write_behind option is only valid for RAID1";
 				rs->ti->error = "max_write_behind option is only valid for RAID1";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
-			rs->ctr_flags |= CTR_FLAG_MAX_WRITE_BEHIND;
+
+			if (test_and_set_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) {
+				rs->ti->error = "Only one max_write_behind argument pair allowed";
+				return -EINVAL;
+			}
 
 
 			/*
 			/*
 			 * In device-mapper, we specify things in sectors, but
 			 * In device-mapper, we specify things in sectors, but
@@ -628,65 +1209,122 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "Max write-behind limit out of range";
 				rs->ti->error = "Max write-behind limit out of range";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
+
 			rs->md.bitmap_info.max_write_behind = value;
 			rs->md.bitmap_info.max_write_behind = value;
-		} else if (!strcasecmp(key, "daemon_sleep")) {
-			rs->ctr_flags |= CTR_FLAG_DAEMON_SLEEP;
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP))) {
+			if (test_and_set_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) {
+				rs->ti->error = "Only one daemon_sleep argument pair allowed";
+				return -EINVAL;
+			}
 			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
 			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
 				rs->ti->error = "daemon sleep period out of range";
 				rs->ti->error = "daemon sleep period out of range";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
 			rs->md.bitmap_info.daemon_sleep = value;
 			rs->md.bitmap_info.daemon_sleep = value;
-		} else if (!strcasecmp(key, "stripe_cache")) {
-			rs->ctr_flags |= CTR_FLAG_STRIPE_CACHE;
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET))) {
+			/* Userspace passes new data_offset after having extended the data image LV */
+			if (test_and_set_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
+				rs->ti->error = "Only one data_offset argument pair allowed";
+				return -EINVAL;
+			}
+			/* Ensure sensible data offset */
+			if (value < 0 ||
+			    (value && (value < MIN_FREE_RESHAPE_SPACE || value % to_sector(PAGE_SIZE)))) {
+				rs->ti->error = "Bogus data_offset value";
+				return -EINVAL;
+			}
+			rs->data_offset = value;
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS))) {
+			/* Define the +/-# of disks to add to/remove from the given raid set */
+			if (test_and_set_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
+				rs->ti->error = "Only one delta_disks argument pair allowed";
+				return -EINVAL;
+			}
+			/* Ensure MAX_RAID_DEVICES and raid type minimal_devs! */
+			if (!__within_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs)) {
+				rs->ti->error = "Too many delta_disk requested";
+				return -EINVAL;
+			}
 
 
-			/*
-			 * In device-mapper, we specify things in sectors, but
-			 * MD records this value in kB
-			 */
-			value /= 2;
+			rs->delta_disks = value;
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE))) {
+			if (test_and_set_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) {
+				rs->ti->error = "Only one stripe_cache argument pair allowed";
+				return -EINVAL;
+			}
 
 
-			if ((rs->raid_type->level != 5) &&
-			    (rs->raid_type->level != 6)) {
+			if (!rt_is_raid456(rt)) {
 				rs->ti->error = "Inappropriate argument: stripe_cache";
 				rs->ti->error = "Inappropriate argument: stripe_cache";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
-			if (raid5_set_cache_size(&rs->md, (int)value)) {
-				rs->ti->error = "Bad stripe_cache size";
+
+			rs->stripe_cache_entries = value;
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
+			if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
+				rs->ti->error = "Only one min_recovery_rate argument pair allowed";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
-		} else if (!strcasecmp(key, "min_recovery_rate")) {
-			rs->ctr_flags |= CTR_FLAG_MIN_RECOVERY_RATE;
 			if (value > INT_MAX) {
 			if (value > INT_MAX) {
 				rs->ti->error = "min_recovery_rate out of range";
 				rs->ti->error = "min_recovery_rate out of range";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
 			rs->md.sync_speed_min = (int)value;
 			rs->md.sync_speed_min = (int)value;
-		} else if (!strcasecmp(key, "max_recovery_rate")) {
-			rs->ctr_flags |= CTR_FLAG_MAX_RECOVERY_RATE;
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) {
+			if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
+				rs->ti->error = "Only one max_recovery_rate argument pair allowed";
+				return -EINVAL;
+			}
 			if (value > INT_MAX) {
 			if (value > INT_MAX) {
 				rs->ti->error = "max_recovery_rate out of range";
 				rs->ti->error = "max_recovery_rate out of range";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
 			rs->md.sync_speed_max = (int)value;
 			rs->md.sync_speed_max = (int)value;
-		} else if (!strcasecmp(key, "region_size")) {
-			rs->ctr_flags |= CTR_FLAG_REGION_SIZE;
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE))) {
+			if (test_and_set_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) {
+				rs->ti->error = "Only one region_size argument pair allowed";
+				return -EINVAL;
+			}
+
 			region_size = value;
 			region_size = value;
-		} else if (!strcasecmp(key, "raid10_copies") &&
-			   (rs->raid_type->level == 10)) {
-			if ((value < 2) || (value > 0xFF)) {
+			rs->requested_bitmap_chunk_sectors = value;
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES))) {
+			if (test_and_set_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) {
+				rs->ti->error = "Only one raid10_copies argument pair allowed";
+				return -EINVAL;
+			}
+
+			if (!__within_range(value, 2, rs->md.raid_disks)) {
 				rs->ti->error = "Bad value for 'raid10_copies'";
 				rs->ti->error = "Bad value for 'raid10_copies'";
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
-			rs->ctr_flags |= CTR_FLAG_RAID10_COPIES;
+
 			raid10_copies = value;
 			raid10_copies = value;
 		} else {
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
 			DMERR("Unable to parse RAID parameter: %s", key);
-			rs->ti->error = "Unable to parse RAID parameters";
+			rs->ti->error = "Unable to parse RAID parameter";
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
 	}
 	}
 
 
-	if (validate_region_size(rs, region_size))
+	if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) &&
+	    test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
+		rs->ti->error = "sync and nosync are mutually exclusive";
+		return -EINVAL;
+	}
+
+	if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) &&
+	    (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ||
+	     test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))) {
+		rs->ti->error = "sync/nosync and rebuild are mutually exclusive";
+		return -EINVAL;
+	}
+
+	if (write_mostly >= rs->md.raid_disks) {
+		rs->ti->error = "Can't set all raid1 devices to write_mostly";
+		return -EINVAL;
+	}
+
+	if (validate_region_size(rs, region_size))
 		return -EINVAL;
 		return -EINVAL;
 
 
 	if (rs->md.chunk_sectors)
 	if (rs->md.chunk_sectors)
@@ -697,47 +1335,193 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	if (dm_set_target_max_io_len(rs->ti, max_io_len))
 	if (dm_set_target_max_io_len(rs->ti, max_io_len))
 		return -EINVAL;
 		return -EINVAL;
 
 
-	if (rs->raid_type->level == 10) {
+	if (rt_is_raid10(rt)) {
 		if (raid10_copies > rs->md.raid_disks) {
 		if (raid10_copies > rs->md.raid_disks) {
 			rs->ti->error = "Not enough devices to satisfy specification";
 			rs->ti->error = "Not enough devices to satisfy specification";
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
 
 
-		/*
-		 * If the format is not "near", we only support
-		 * two copies at the moment.
-		 */
-		if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
-			rs->ti->error = "Too many copies for given RAID10 format.";
+		rs->md.new_layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
+		if (rs->md.new_layout < 0) {
+			rs->ti->error = "Error getting raid10 format";
+			return rs->md.new_layout;
+		}
+
+		rt = get_raid_type_by_ll(10, rs->md.new_layout);
+		if (!rt) {
+			rs->ti->error = "Failed to recognize new raid10 layout";
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
 
 
-		/* (Len * #mirrors) / #devices */
-		sectors_per_dev = rs->ti->len * raid10_copies;
-		sector_div(sectors_per_dev, rs->md.raid_disks);
-
-		rs->md.layout = raid10_format_to_md_layout(raid10_format,
-							   raid10_copies);
-		rs->md.new_layout = rs->md.layout;
-	} else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
-		   sector_div(sectors_per_dev,
-			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
-		rs->ti->error = "Target length not divisible by number of data devices";
-		return -EINVAL;
+		if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT ||
+		     rt->algorithm == ALGORITHM_RAID10_NEAR) &&
+		    test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
+			rs->ti->error = "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible";
+			return -EINVAL;
+		}
 	}
 	}
-	rs->md.dev_sectors = sectors_per_dev;
+
+	rs->raid10_copies = raid10_copies;
 
 
 	/* Assume there are no metadata devices until the drives are parsed */
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
 	rs->md.persistent = 0;
 	rs->md.external = 1;
 	rs->md.external = 1;
 
 
+	/* Check, if any invalid ctr arguments have been passed in for the raid level */
+	return rs_check_for_valid_flags(rs);
+}
+
+/* Set raid4/5/6 cache size */
+static int rs_set_raid456_stripe_cache(struct raid_set *rs)
+{
+	int r;
+	struct r5conf *conf;
+	struct mddev *mddev = &rs->md;
+	uint32_t min_stripes = max(mddev->chunk_sectors, mddev->new_chunk_sectors) / 2;
+	uint32_t nr_stripes = rs->stripe_cache_entries;
+
+	if (!rt_is_raid456(rs->raid_type)) {
+		rs->ti->error = "Inappropriate raid level; cannot change stripe_cache size";
+		return -EINVAL;
+	}
+
+	if (nr_stripes < min_stripes) {
+		DMINFO("Adjusting requested %u stripe cache entries to %u to suit stripe size",
+		       nr_stripes, min_stripes);
+		nr_stripes = min_stripes;
+	}
+
+	conf = mddev->private;
+	if (!conf) {
+		rs->ti->error = "Cannot change stripe_cache size on inactive RAID set";
+		return -EINVAL;
+	}
+
+	/* Try setting number of stripes in raid456 stripe cache */
+	if (conf->min_nr_stripes != nr_stripes) {
+		r = raid5_set_cache_size(mddev, nr_stripes);
+		if (r) {
+			rs->ti->error = "Failed to set raid4/5/6 stripe cache size";
+			return r;
+		}
+
+		DMINFO("%u stripe cache entries", nr_stripes);
+	}
+
 	return 0;
 	return 0;
 }
 }
 
 
+/* Return # of data stripes as kept in mddev as of @rs (i.e. as of superblock) */
+static unsigned int mddev_data_stripes(struct raid_set *rs)
+{
+	return rs->md.raid_disks - rs->raid_type->parity_devs;
+}
+
+/* Return # of data stripes of @rs (i.e. as of ctr) */
+static unsigned int rs_data_stripes(struct raid_set *rs)
+{
+	return rs->raid_disks - rs->raid_type->parity_devs;
+}
+
+/* Calculate the sectors per device and per array used for @rs */
+static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
+{
+	int delta_disks;
+	unsigned int data_stripes;
+	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
+	sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len;
+
+	if (use_mddev) {
+		delta_disks = mddev->delta_disks;
+		data_stripes = mddev_data_stripes(rs);
+	} else {
+		delta_disks = rs->delta_disks;
+		data_stripes = rs_data_stripes(rs);
+	}
+
+	/* Special raid1 case w/o delta_disks support (yet) */
+	if (rt_is_raid1(rs->raid_type))
+		;
+	else if (rt_is_raid10(rs->raid_type)) {
+		if (rs->raid10_copies < 2 ||
+		    delta_disks < 0) {
+			rs->ti->error = "Bogus raid10 data copies or delta disks";
+			return -EINVAL;
+		}
+
+		dev_sectors *= rs->raid10_copies;
+		if (sector_div(dev_sectors, data_stripes))
+			goto bad;
+
+		array_sectors = (data_stripes + delta_disks) * dev_sectors;
+		if (sector_div(array_sectors, rs->raid10_copies))
+			goto bad;
+
+	} else if (sector_div(dev_sectors, data_stripes))
+		goto bad;
+
+	else
+		/* Striped layouts */
+		array_sectors = (data_stripes + delta_disks) * dev_sectors;
+
+	rdev_for_each(rdev, mddev)
+		rdev->sectors = dev_sectors;
+
+	mddev->array_sectors = array_sectors;
+	mddev->dev_sectors = dev_sectors;
+
+	return 0;
+bad:
+	rs->ti->error = "Target length not divisible by number of data devices";
+	return -EINVAL;
+}
+
+/* Setup recovery on @rs */
+static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
+{
+	/* raid0 does not recover */
+	if (rs_is_raid0(rs))
+		rs->md.recovery_cp = MaxSector;
+	/*
+	 * A raid6 set has to be recovered either
+	 * completely or for the grown part to
+	 * ensure proper parity and Q-Syndrome
+	 */
+	else if (rs_is_raid6(rs))
+		rs->md.recovery_cp = dev_sectors;
+	/*
+	 * Other raid set types may skip recovery
+	 * depending on the 'nosync' flag.
+	 */
+	else
+		rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
+				     ? MaxSector : dev_sectors;
+}
+
+/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */
+static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
+{
+	if (!dev_sectors)
+		/* New raid set or 'sync' flag provided */
+		__rs_setup_recovery(rs, 0);
+	else if (dev_sectors == MaxSector)
+		/* Prevent recovery */
+		__rs_setup_recovery(rs, MaxSector);
+	else if (rs->dev[0].rdev.sectors < dev_sectors)
+		/* Grown raid set */
+		__rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
+	else
+		__rs_setup_recovery(rs, MaxSector);
+}
+
 static void do_table_event(struct work_struct *ws)
 static void do_table_event(struct work_struct *ws)
 {
 {
 	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
 	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
 
 
+	smp_rmb(); /* Make sure we access most actual mddev properties */
+	if (!rs_is_reshaping(rs))
+		rs_set_capacity(rs);
 	dm_table_event(rs->ti->table);
 	dm_table_event(rs->ti->table);
 }
 }
 
 
@@ -748,6 +1532,211 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 	return mddev_congested(&rs->md, bits);
 	return mddev_congested(&rs->md, bits);
 }
 }
 
 
+/*
+ * Make sure a valid takeover (level switch) is being requested on @rs
+ *
+ * Conversions of raid sets from one MD personality to another
+ * have to conform to restrictions which are enforced here.
+ */
+static int rs_check_takeover(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+	unsigned int near_copies;
+
+	if (rs->md.degraded) {
+		rs->ti->error = "Can't takeover degraded raid set";
+		return -EPERM;
+	}
+
+	if (rs_is_reshaping(rs)) {
+		rs->ti->error = "Can't takeover reshaping raid set";
+		return -EPERM;
+	}
+
+	switch (mddev->level) {
+	case 0:
+		/* raid0 -> raid1/5 with one disk */
+		if ((mddev->new_level == 1 || mddev->new_level == 5) &&
+		    mddev->raid_disks == 1)
+			return 0;
+
+		/* raid0 -> raid10 */
+		if (mddev->new_level == 10 &&
+		    !(rs->raid_disks % mddev->raid_disks))
+			return 0;
+
+		/* raid0 with multiple disks -> raid4/5/6 */
+		if (__within_range(mddev->new_level, 4, 6) &&
+		    mddev->new_layout == ALGORITHM_PARITY_N &&
+		    mddev->raid_disks > 1)
+			return 0;
+
+		break;
+
+	case 10:
+		/* Can't takeover raid10_offset! */
+		if (__is_raid10_offset(mddev->layout))
+			break;
+
+		near_copies = __raid10_near_copies(mddev->layout);
+
+		/* raid10* -> raid0 */
+		if (mddev->new_level == 0) {
+			/* Can takeover raid10_near with raid disks divisible by data copies! */
+			if (near_copies > 1 &&
+			    !(mddev->raid_disks % near_copies)) {
+				mddev->raid_disks /= near_copies;
+				mddev->delta_disks = mddev->raid_disks;
+				return 0;
+			}
+
+			/* Can takeover raid10_far */
+			if (near_copies == 1 &&
+			    __raid10_far_copies(mddev->layout) > 1)
+				return 0;
+
+			break;
+		}
+
+		/* raid10_{near,far} -> raid1 */
+		if (mddev->new_level == 1 &&
+		    max(near_copies, __raid10_far_copies(mddev->layout)) == mddev->raid_disks)
+			return 0;
+
+		/* raid10_{near,far} with 2 disks -> raid4/5 */
+		if (__within_range(mddev->new_level, 4, 5) &&
+		    mddev->raid_disks == 2)
+			return 0;
+		break;
+
+	case 1:
+		/* raid1 with 2 disks -> raid4/5 */
+		if (__within_range(mddev->new_level, 4, 5) &&
+		    mddev->raid_disks == 2) {
+			mddev->degraded = 1;
+			return 0;
+		}
+
+		/* raid1 -> raid0 */
+		if (mddev->new_level == 0 &&
+		    mddev->raid_disks == 1)
+			return 0;
+
+		/* raid1 -> raid10 */
+		if (mddev->new_level == 10)
+			return 0;
+		break;
+
+	case 4:
+		/* raid4 -> raid0 */
+		if (mddev->new_level == 0)
+			return 0;
+
+		/* raid4 -> raid1/5 with 2 disks */
+		if ((mddev->new_level == 1 || mddev->new_level == 5) &&
+		    mddev->raid_disks == 2)
+			return 0;
+
+		/* raid4 -> raid5/6 with parity N */
+		if (__within_range(mddev->new_level, 5, 6) &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+		break;
+
+	case 5:
+		/* raid5 with parity N -> raid0 */
+		if (mddev->new_level == 0 &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+
+		/* raid5 with parity N -> raid4 */
+		if (mddev->new_level == 4 &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+
+		/* raid5 with 2 disks -> raid1/4/10 */
+		if ((mddev->new_level == 1 || mddev->new_level == 4 || mddev->new_level == 10) &&
+		    mddev->raid_disks == 2)
+			return 0;
+
+		/* raid5_* -> raid6_*_6 with Q-Syndrome N (e.g. raid5_ra -> raid6_ra_6) */
+		if (mddev->new_level == 6 &&
+		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
+		      __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6)))
+			return 0;
+		break;
+
+	case 6:
+		/* raid6 with parity N -> raid0 */
+		if (mddev->new_level == 0 &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+
+		/* raid6 with parity N -> raid4 */
+		if (mddev->new_level == 4 &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+
+		/* raid6_*_n with Q-Syndrome N -> raid5_* */
+		if (mddev->new_level == 5 &&
+		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
+		     __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
+			return 0;
+
+	default:
+		break;
+	}
+
+	rs->ti->error = "takeover not possible";
+	return -EINVAL;
+}
+
+/* True if @rs requested to be taken over */
+static bool rs_takeover_requested(struct raid_set *rs)
+{
+	return rs->md.new_level != rs->md.level;
+}
+
+/* True if @rs is requested to reshape by ctr */
+static bool rs_reshape_requested(struct raid_set *rs)
+{
+	bool change;
+	struct mddev *mddev = &rs->md;
+
+	if (rs_takeover_requested(rs))
+		return false;
+
+	if (!mddev->level)
+		return false;
+
+	change = mddev->new_layout != mddev->layout ||
+		 mddev->new_chunk_sectors != mddev->chunk_sectors ||
+		 rs->delta_disks;
+
+	/* Historical case to support raid1 reshape without delta disks */
+	if (mddev->level == 1) {
+		if (rs->delta_disks)
+			return !!rs->delta_disks;
+
+		return !change &&
+		       mddev->raid_disks != rs->raid_disks;
+	}
+
+	if (mddev->level == 10)
+		return change &&
+		       !__is_raid10_far(mddev->new_layout) &&
+		       rs->delta_disks >= 0;
+
+	return change;
+}
+
+/*  Features */
+#define	FEATURE_FLAG_SUPPORTS_V190	0x1 /* Supports extended superblock */
+
+/* State flags for sb->flags */
+#define	SB_FLAG_RESHAPE_ACTIVE		0x1
+#define	SB_FLAG_RESHAPE_BACKWARDS	0x2
+
 /*
 /*
  * This structure is never routinely used by userspace, unlike md superblocks.
  * This structure is never routinely used by userspace, unlike md superblocks.
  * Devices with this superblock should only ever be accessed via device-mapper.
  * Devices with this superblock should only ever be accessed via device-mapper.
@@ -755,13 +1744,14 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 #define DM_RAID_MAGIC 0x64526D44
 #define DM_RAID_MAGIC 0x64526D44
 struct dm_raid_superblock {
 struct dm_raid_superblock {
 	__le32 magic;		/* "DmRd" */
 	__le32 magic;		/* "DmRd" */
-	__le32 features;	/* Used to indicate possible future changes */
+	__le32 compat_features;	/* Used to indicate compatible features (like 1.9.0 ondisk metadata extension) */
 
 
-	__le32 num_devices;	/* Number of devices in this array. (Max 64) */
-	__le32 array_position;	/* The position of this drive in the array */
+	__le32 num_devices;	/* Number of devices in this raid set. (Max 64) */
+	__le32 array_position;	/* The position of this drive in the raid set */
 
 
 	__le64 events;		/* Incremented by md when superblock updated */
 	__le64 events;		/* Incremented by md when superblock updated */
-	__le64 failed_devices;	/* Bit field of devices to indicate failures */
+	__le64 failed_devices;	/* Pre 1.9.0 part of bit field of devices to */
+				/* indicate failures (see extension below) */
 
 
 	/*
 	/*
 	 * This offset tracks the progress of the repair or replacement of
 	 * This offset tracks the progress of the repair or replacement of
@@ -770,21 +1760,95 @@ struct dm_raid_superblock {
 	__le64 disk_recovery_offset;
 	__le64 disk_recovery_offset;
 
 
 	/*
 	/*
-	 * This offset tracks the progress of the initial array
+	 * This offset tracks the progress of the initial raid set
 	 * synchronisation/parity calculation.
 	 * synchronisation/parity calculation.
 	 */
 	 */
 	__le64 array_resync_offset;
 	__le64 array_resync_offset;
 
 
 	/*
 	/*
-	 * RAID characteristics
+	 * raid characteristics
 	 */
 	 */
 	__le32 level;
 	__le32 level;
 	__le32 layout;
 	__le32 layout;
 	__le32 stripe_sectors;
 	__le32 stripe_sectors;
 
 
-	/* Remainder of a logical block is zero-filled when writing (see super_sync()). */
+	/********************************************************************
+	 * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
+	 *
+	 * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
+	 */
+
+	__le32 flags; /* Flags defining array states for reshaping */
+
+	/*
+	 * This offset tracks the progress of a raid
+	 * set reshape in order to be able to restart it
+	 */
+	__le64 reshape_position;
+
+	/*
+	 * These define the properties of the array in case of an interrupted reshape
+	 */
+	__le32 new_level;
+	__le32 new_layout;
+	__le32 new_stripe_sectors;
+	__le32 delta_disks;
+
+	__le64 array_sectors; /* Array size in sectors */
+
+	/*
+	 * Sector offsets to data on devices (reshaping).
+	 * Needed to support out of place reshaping, thus
+	 * not writing over any stripes whilst converting
+	 * them from old to new layout
+	 */
+	__le64 data_offset;
+	__le64 new_data_offset;
+
+	__le64 sectors; /* Used device size in sectors */
+
+	/*
+	 * Additional bit field of devices indicating failures to support
+	 * up to 256 devices with the 1.9.0 on-disk metadata format
+	 */
+	__le64 extended_failed_devices[DISKS_ARRAY_ELEMS - 1];
+
+	__le32 incompat_features;	/* Used to indicate any incompatible features */
+
+	/* Always set rest up to logical block size to 0 when writing (see get_metadata_device() below). */
 } __packed;
 } __packed;
 
 
+/*
+ * Check for reshape constraints on raid set @rs:
+ *
+ * - reshape function non-existent
+ * - degraded set
+ * - ongoing recovery
+ * - ongoing reshape
+ *
+ * Returns 0 if none applies, or -EPERM if a constraint is hit,
+ * in which case the error message is stored in rs->ti->error
+ */
+static int rs_check_reshape(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	if (!mddev->pers || !mddev->pers->check_reshape)
+		rs->ti->error = "Reshape not supported";
+	else if (mddev->degraded)
+		rs->ti->error = "Can't reshape degraded raid set";
+	else if (rs_is_recovering(rs))
+		rs->ti->error = "Convert request on recovering raid set prohibited";
+	else if (rs_is_reshaping(rs))
+		rs->ti->error = "raid set already reshaping!";
+	else if (!(rs_is_raid1(rs) || rs_is_raid10(rs) || rs_is_raid456(rs)))
+		rs->ti->error = "Reshaping only supported for raid1/4/5/6/10";
+	else
+		return 0;
+
+	return -EPERM;
+}
+
 static int read_disk_sb(struct md_rdev *rdev, int size)
 static int read_disk_sb(struct md_rdev *rdev, int size)
 {
 {
 	BUG_ON(!rdev->sb_page);
 	BUG_ON(!rdev->sb_page);
@@ -792,7 +1856,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
 	if (rdev->sb_loaded)
 	if (rdev->sb_loaded)
 		return 0;
 		return 0;
 
 
-	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, 1)) {
+	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
 		DMERR("Failed to read superblock of device at position %d",
 		DMERR("Failed to read superblock of device at position %d",
 		      rdev->raid_disk);
 		      rdev->raid_disk);
 		md_error(rdev->mddev, rdev);
 		md_error(rdev->mddev, rdev);
@@ -804,31 +1868,67 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
 	return 0;
 	return 0;
 }
 }
 
 
+static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices)
+{
+	failed_devices[0] = le64_to_cpu(sb->failed_devices);
+	memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices));
+
+	if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
+		int i = ARRAY_SIZE(sb->extended_failed_devices);
+
+		while (i--)
+			failed_devices[i+1] = le64_to_cpu(sb->extended_failed_devices[i]);
+	}
+}
+
+static void sb_update_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices)
+{
+	int i = ARRAY_SIZE(sb->extended_failed_devices);
+
+	sb->failed_devices = cpu_to_le64(failed_devices[0]);
+	while (i--)
+		sb->extended_failed_devices[i] = cpu_to_le64(failed_devices[i+1]);
+}
+
+/*
+ * Synchronize the superblock members with the raid set properties
+ *
+ * All superblock data is little endian.
+ */
 static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 {
 {
-	int i;
-	uint64_t failed_devices;
+	bool update_failed_devices = false;
+	unsigned int i;
+	uint64_t failed_devices[DISKS_ARRAY_ELEMS];
 	struct dm_raid_superblock *sb;
 	struct dm_raid_superblock *sb;
 	struct raid_set *rs = container_of(mddev, struct raid_set, md);
 	struct raid_set *rs = container_of(mddev, struct raid_set, md);
 
 
+	/* No metadata device, no superblock */
+	if (!rdev->meta_bdev)
+		return;
+
+	BUG_ON(!rdev->sb_page);
+
 	sb = page_address(rdev->sb_page);
 	sb = page_address(rdev->sb_page);
-	failed_devices = le64_to_cpu(sb->failed_devices);
 
 
-	for (i = 0; i < mddev->raid_disks; i++)
-		if (!rs->dev[i].data_dev ||
-		    test_bit(Faulty, &(rs->dev[i].rdev.flags)))
-			failed_devices |= (1ULL << i);
+	sb_retrieve_failed_devices(sb, failed_devices);
 
 
-	memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
+	for (i = 0; i < rs->raid_disks; i++)
+		if (!rs->dev[i].data_dev || test_bit(Faulty, &rs->dev[i].rdev.flags)) {
+			update_failed_devices = true;
+			set_bit(i, (void *) failed_devices);
+		}
+
+	if (update_failed_devices)
+		sb_update_failed_devices(sb, failed_devices);
 
 
 	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
 	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
-	sb->features = cpu_to_le32(0);	/* No features yet */
+	sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
 
 
 	sb->num_devices = cpu_to_le32(mddev->raid_disks);
 	sb->num_devices = cpu_to_le32(mddev->raid_disks);
 	sb->array_position = cpu_to_le32(rdev->raid_disk);
 	sb->array_position = cpu_to_le32(rdev->raid_disk);
 
 
 	sb->events = cpu_to_le64(mddev->events);
 	sb->events = cpu_to_le64(mddev->events);
-	sb->failed_devices = cpu_to_le64(failed_devices);
 
 
 	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
 	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
 	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
 	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
@@ -836,6 +1936,33 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 	sb->level = cpu_to_le32(mddev->level);
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
 	sb->layout = cpu_to_le32(mddev->layout);
 	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
 	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+
+	sb->new_level = cpu_to_le32(mddev->new_level);
+	sb->new_layout = cpu_to_le32(mddev->new_layout);
+	sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
+
+	sb->delta_disks = cpu_to_le32(mddev->delta_disks);
+
+	smp_rmb(); /* Make sure we access most recent reshape position */
+	sb->reshape_position = cpu_to_le64(mddev->reshape_position);
+	if (le64_to_cpu(sb->reshape_position) != MaxSector) {
+		/* Flag ongoing reshape */
+		sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE);
+
+		if (mddev->delta_disks < 0 || mddev->reshape_backwards)
+			sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_BACKWARDS);
+	} else {
+		/* Clear reshape flags */
+		sb->flags &= ~(cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE|SB_FLAG_RESHAPE_BACKWARDS));
+	}
+
+	sb->array_sectors = cpu_to_le64(mddev->array_sectors);
+	sb->data_offset = cpu_to_le64(rdev->data_offset);
+	sb->new_data_offset = cpu_to_le64(rdev->new_data_offset);
+	sb->sectors = cpu_to_le64(rdev->sectors);
+
+	/* Zero out the rest of the payload after the size of the superblock */
+	memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
 }
 }
 
 
 /*
 /*
@@ -848,7 +1975,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
  */
  */
 static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 {
 {
-	int ret;
+	int r;
 	struct dm_raid_superblock *sb;
 	struct dm_raid_superblock *sb;
 	struct dm_raid_superblock *refsb;
 	struct dm_raid_superblock *refsb;
 	uint64_t events_sb, events_refsb;
 	uint64_t events_sb, events_refsb;
@@ -860,9 +1987,9 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	ret = read_disk_sb(rdev, rdev->sb_size);
-	if (ret)
-		return ret;
+	r = read_disk_sb(rdev, rdev->sb_size);
+	if (r)
+		return r;
 
 
 	sb = page_address(rdev->sb_page);
 	sb = page_address(rdev->sb_page);
 
 
@@ -876,6 +2003,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 		super_sync(rdev->mddev, rdev);
 		super_sync(rdev->mddev, rdev);
 
 
 		set_bit(FirstUse, &rdev->flags);
 		set_bit(FirstUse, &rdev->flags);
+		sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
 
 
 		/* Force writing of superblocks to disk */
 		/* Force writing of superblocks to disk */
 		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
 		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
@@ -895,129 +2023,212 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 	return (events_sb > events_refsb) ? 1 : 0;
 	return (events_sb > events_refsb) ? 1 : 0;
 }
 }
 
 
-static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
+static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 {
 {
 	int role;
 	int role;
-	struct raid_set *rs = container_of(mddev, struct raid_set, md);
+	unsigned int d;
+	struct mddev *mddev = &rs->md;
 	uint64_t events_sb;
 	uint64_t events_sb;
-	uint64_t failed_devices;
+	uint64_t failed_devices[DISKS_ARRAY_ELEMS];
 	struct dm_raid_superblock *sb;
 	struct dm_raid_superblock *sb;
-	uint32_t new_devs = 0;
-	uint32_t rebuilds = 0;
+	uint32_t new_devs = 0, rebuild_and_new = 0, rebuilds = 0;
 	struct md_rdev *r;
 	struct md_rdev *r;
 	struct dm_raid_superblock *sb2;
 	struct dm_raid_superblock *sb2;
 
 
 	sb = page_address(rdev->sb_page);
 	sb = page_address(rdev->sb_page);
 	events_sb = le64_to_cpu(sb->events);
 	events_sb = le64_to_cpu(sb->events);
-	failed_devices = le64_to_cpu(sb->failed_devices);
 
 
 	/*
 	/*
 	 * Initialise to 1 if this is a new superblock.
 	 * Initialise to 1 if this is a new superblock.
 	 */
 	 */
 	mddev->events = events_sb ? : 1;
 	mddev->events = events_sb ? : 1;
 
 
+	mddev->reshape_position = MaxSector;
+
 	/*
 	/*
-	 * Reshaping is not currently allowed
+	 * Reshaping is supported, e.g. reshape_position is valid
+	 * in superblock and superblock content is authoritative.
 	 */
 	 */
-	if (le32_to_cpu(sb->level) != mddev->level) {
-		DMERR("Reshaping arrays not yet supported. (RAID level change)");
-		return -EINVAL;
-	}
-	if (le32_to_cpu(sb->layout) != mddev->layout) {
-		DMERR("Reshaping arrays not yet supported. (RAID layout change)");
-		DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
-		DMERR("  Old layout: %s w/ %d copies",
-		      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
-		      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
-		DMERR("  New layout: %s w/ %d copies",
-		      raid10_md_layout_to_format(mddev->layout),
-		      raid10_md_layout_to_copies(mddev->layout));
-		return -EINVAL;
-	}
-	if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
-		DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
-		return -EINVAL;
-	}
+	if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
+		/* Superblock is authoritative wrt given raid set layout! */
+		mddev->raid_disks = le32_to_cpu(sb->num_devices);
+		mddev->level = le32_to_cpu(sb->level);
+		mddev->layout = le32_to_cpu(sb->layout);
+		mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
+		mddev->new_level = le32_to_cpu(sb->new_level);
+		mddev->new_layout = le32_to_cpu(sb->new_layout);
+		mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors);
+		mddev->delta_disks = le32_to_cpu(sb->delta_disks);
+		mddev->array_sectors = le64_to_cpu(sb->array_sectors);
+
+		/* raid was reshaping and got interrupted */
+		if (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_ACTIVE) {
+			if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
+				DMERR("Reshape requested but raid set is still reshaping");
+				return -EINVAL;
+			}
 
 
-	/* We can only change the number of devices in RAID1 right now */
-	if ((rs->raid_type->level != 1) &&
-	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
-		DMERR("Reshaping arrays not yet supported. (device count change)");
-		return -EINVAL;
+			if (mddev->delta_disks < 0 ||
+			    (!mddev->delta_disks && (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_BACKWARDS)))
+				mddev->reshape_backwards = 1;
+			else
+				mddev->reshape_backwards = 0;
+
+			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+			rs->raid_type = get_raid_type_by_ll(mddev->level, mddev->layout);
+		}
+
+	} else {
+		/*
+		 * No takeover/reshaping, because we don't have the extended v1.9.0 metadata
+		 */
+		if (le32_to_cpu(sb->level) != mddev->level) {
+			DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)");
+			return -EINVAL;
+		}
+		if (le32_to_cpu(sb->layout) != mddev->layout) {
+			DMERR("Reshaping raid sets not yet supported. (raid layout change)");
+			DMERR("	 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+			DMERR("	 Old layout: %s w/ %d copies",
+			      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+			      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+			DMERR("	 New layout: %s w/ %d copies",
+			      raid10_md_layout_to_format(mddev->layout),
+			      raid10_md_layout_to_copies(mddev->layout));
+			return -EINVAL;
+		}
+		if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+			DMERR("Reshaping raid sets not yet supported. (stripe sectors change)");
+			return -EINVAL;
+		}
+
+		/* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */
+		if (!rt_is_raid1(rs->raid_type) &&
+		    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
+			DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)",
+			      sb->num_devices, mddev->raid_disks);
+			return -EINVAL;
+		}
+
+		/* Table line is checked vs. authoritative superblock */
+		rs_set_new(rs);
 	}
 	}
 
 
-	if (!(rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)))
+	if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
 		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
 		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
 
 
 	/*
 	/*
 	 * During load, we set FirstUse if a new superblock was written.
 	 * During load, we set FirstUse if a new superblock was written.
 	 * There are two reasons we might not have a superblock:
 	 * There are two reasons we might not have a superblock:
-	 * 1) The array is brand new - in which case, all of the
-	 *    devices must have their In_sync bit set.  Also,
+	 * 1) The raid set is brand new - in which case, all of the
+	 *    devices must have their In_sync bit set.	Also,
 	 *    recovery_cp must be 0, unless forced.
 	 *    recovery_cp must be 0, unless forced.
-	 * 2) This is a new device being added to an old array
+	 * 2) This is a new device being added to an old raid set
 	 *    and the new device needs to be rebuilt - in which
 	 *    and the new device needs to be rebuilt - in which
 	 *    case the In_sync bit will /not/ be set and
 	 *    case the In_sync bit will /not/ be set and
 	 *    recovery_cp must be MaxSector.
 	 *    recovery_cp must be MaxSector.
+	 * 3) This is/are a new device(s) being added to an old
+	 *    raid set during takeover to a higher raid level
+	 *    to provide capacity for redundancy or during reshape
+	 *    to add capacity to grow the raid set.
 	 */
 	 */
+	d = 0;
 	rdev_for_each(r, mddev) {
 	rdev_for_each(r, mddev) {
+		if (test_bit(FirstUse, &r->flags))
+			new_devs++;
+
 		if (!test_bit(In_sync, &r->flags)) {
 		if (!test_bit(In_sync, &r->flags)) {
-			DMINFO("Device %d specified for rebuild: "
-			       "Clearing superblock", r->raid_disk);
+			DMINFO("Device %d specified for rebuild; clearing superblock",
+				r->raid_disk);
 			rebuilds++;
 			rebuilds++;
-		} else if (test_bit(FirstUse, &r->flags))
-			new_devs++;
+
+			if (test_bit(FirstUse, &r->flags))
+				rebuild_and_new++;
+		}
+
+		d++;
 	}
 	}
 
 
-	if (!rebuilds) {
-		if (new_devs == mddev->raid_disks) {
-			DMINFO("Superblocks created for new array");
+	if (new_devs == rs->raid_disks || !rebuilds) {
+		/* Replace a broken device */
+		if (new_devs == 1 && !rs->delta_disks)
+			;
+		if (new_devs == rs->raid_disks) {
+			DMINFO("Superblocks created for new raid set");
 			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
 			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
-		} else if (new_devs) {
-			DMERR("New device injected "
-			      "into existing array without 'rebuild' "
-			      "parameter specified");
+		} else if (new_devs != rebuilds &&
+			   new_devs != rs->delta_disks) {
+			DMERR("New device injected into existing raid set without "
+			      "'delta_disks' or 'rebuild' parameter specified");
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
-	} else if (new_devs) {
-		DMERR("'rebuild' devices cannot be "
-		      "injected into an array with other first-time devices");
-		return -EINVAL;
-	} else if (mddev->recovery_cp != MaxSector) {
-		DMERR("'rebuild' specified while array is not in-sync");
+	} else if (new_devs && new_devs != rebuilds) {
+		DMERR("%u 'rebuild' devices cannot be injected into"
+		      " a raid set with %u other first-time devices",
+		      rebuilds, new_devs);
 		return -EINVAL;
 		return -EINVAL;
+	} else if (rebuilds) {
+		if (rebuild_and_new && rebuilds != rebuild_and_new) {
+			DMERR("new device%s provided without 'rebuild'",
+			      new_devs > 1 ? "s" : "");
+			return -EINVAL;
+		} else if (rs_is_recovering(rs)) {
+			DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
+			      (unsigned long long) mddev->recovery_cp);
+			return -EINVAL;
+		} else if (rs_is_reshaping(rs)) {
+			DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)",
+			      (unsigned long long) mddev->reshape_position);
+			return -EINVAL;
+		}
 	}
 	}
 
 
 	/*
 	/*
 	 * Now we set the Faulty bit for those devices that are
 	 * Now we set the Faulty bit for those devices that are
 	 * recorded in the superblock as failed.
 	 * recorded in the superblock as failed.
 	 */
 	 */
+	sb_retrieve_failed_devices(sb, failed_devices);
 	rdev_for_each(r, mddev) {
 	rdev_for_each(r, mddev) {
 		if (!r->sb_page)
 		if (!r->sb_page)
 			continue;
 			continue;
 		sb2 = page_address(r->sb_page);
 		sb2 = page_address(r->sb_page);
 		sb2->failed_devices = 0;
 		sb2->failed_devices = 0;
+		memset(sb2->extended_failed_devices, 0, sizeof(sb2->extended_failed_devices));
 
 
 		/*
 		/*
 		 * Check for any device re-ordering.
 		 * Check for any device re-ordering.
 		 */
 		 */
 		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
 		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
 			role = le32_to_cpu(sb2->array_position);
 			role = le32_to_cpu(sb2->array_position);
+			if (role < 0)
+				continue;
+
 			if (role != r->raid_disk) {
 			if (role != r->raid_disk) {
-				if (rs->raid_type->level != 1) {
-					rs->ti->error = "Cannot change device "
-						"positions in RAID array";
+				if (__is_raid10_near(mddev->layout)) {
+					if (mddev->raid_disks % __raid10_near_copies(mddev->layout) ||
+					    rs->raid_disks % rs->raid10_copies) {
+						rs->ti->error =
+							"Cannot change raid10 near set to odd # of devices!";
+						return -EINVAL;
+					}
+
+					sb2->array_position = cpu_to_le32(r->raid_disk);
+
+				} else if (!(rs_is_raid10(rs) && rt_is_raid0(rs->raid_type)) &&
+					   !(rs_is_raid0(rs) && rt_is_raid10(rs->raid_type)) &&
+					   !rt_is_raid1(rs->raid_type)) {
+					rs->ti->error = "Cannot change device positions in raid set";
 					return -EINVAL;
 					return -EINVAL;
 				}
 				}
-				DMINFO("RAID1 device #%d now at position #%d",
-				       role, r->raid_disk);
+
+				DMINFO("raid device #%d now at position #%d", role, r->raid_disk);
 			}
 			}
 
 
 			/*
 			/*
 			 * Partial recovery is performed on
 			 * Partial recovery is performed on
 			 * returning failed devices.
 			 * returning failed devices.
 			 */
 			 */
-			if (failed_devices & (1 << role))
+			if (test_bit(role, (void *) failed_devices))
 				set_bit(Faulty, &r->flags);
 				set_bit(Faulty, &r->flags);
 		}
 		}
 	}
 	}
@@ -1028,41 +2239,60 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 {
 {
 	struct mddev *mddev = &rs->md;
 	struct mddev *mddev = &rs->md;
-	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
+	struct dm_raid_superblock *sb;
+
+	if (rs_is_raid0(rs) || !rdev->sb_page)
+		return 0;
+
+	sb = page_address(rdev->sb_page);
 
 
 	/*
 	/*
 	 * If mddev->events is not set, we know we have not yet initialized
 	 * If mddev->events is not set, we know we have not yet initialized
 	 * the array.
 	 * the array.
 	 */
 	 */
-	if (!mddev->events && super_init_validation(mddev, rdev))
+	if (!mddev->events && super_init_validation(rs, rdev))
+		return -EINVAL;
+
+	if (le32_to_cpu(sb->compat_features) != FEATURE_FLAG_SUPPORTS_V190) {
+		rs->ti->error = "Unable to assemble array: Unknown flag(s) in compatible feature flags";
 		return -EINVAL;
 		return -EINVAL;
+	}
 
 
-	if (le32_to_cpu(sb->features)) {
-		rs->ti->error = "Unable to assemble array: No feature flags supported yet";
+	if (sb->incompat_features) {
+		rs->ti->error = "Unable to assemble array: No incompatible feature flags supported yet";
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
 	/* Enable bitmap creation for RAID levels != 0 */
 	/* Enable bitmap creation for RAID levels != 0 */
-	mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+	mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
 	rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
 	rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
 
 
-	if (!test_bit(FirstUse, &rdev->flags)) {
+	if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
+		/* Retrieve device size stored in superblock to be prepared for shrink */
+		rdev->sectors = le64_to_cpu(sb->sectors);
 		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
 		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
-		if (rdev->recovery_offset != MaxSector)
-			clear_bit(In_sync, &rdev->flags);
+		if (rdev->recovery_offset == MaxSector)
+			set_bit(In_sync, &rdev->flags);
+		/*
+		 * If no reshape in progress -> we're recovering single
+		 * disk(s) and have to set the device(s) to out-of-sync
+		 */
+		else if (!rs_is_reshaping(rs))
+			clear_bit(In_sync, &rdev->flags); /* Mandatory for recovery */
 	}
 	}
 
 
 	/*
 	/*
 	 * If a device comes back, set it as not In_sync and no longer faulty.
 	 * If a device comes back, set it as not In_sync and no longer faulty.
 	 */
 	 */
-	if (test_bit(Faulty, &rdev->flags)) {
-		clear_bit(Faulty, &rdev->flags);
+	if (test_and_clear_bit(Faulty, &rdev->flags)) {
+		rdev->recovery_offset = 0;
 		clear_bit(In_sync, &rdev->flags);
 		clear_bit(In_sync, &rdev->flags);
 		rdev->saved_raid_disk = rdev->raid_disk;
 		rdev->saved_raid_disk = rdev->raid_disk;
-		rdev->recovery_offset = 0;
 	}
 	}
 
 
-	clear_bit(FirstUse, &rdev->flags);
+	/* Reshape support -> restore respective data offsets */
+	rdev->data_offset = le64_to_cpu(sb->data_offset);
+	rdev->new_data_offset = le64_to_cpu(sb->new_data_offset);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -1072,7 +2302,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
  */
  */
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 {
 {
-	int ret;
+	int r;
 	struct raid_dev *dev;
 	struct raid_dev *dev;
 	struct md_rdev *rdev, *tmp, *freshest;
 	struct md_rdev *rdev, *tmp, *freshest;
 	struct mddev *mddev = &rs->md;
 	struct mddev *mddev = &rs->md;
@@ -1082,24 +2312,22 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		/*
 		/*
 		 * Skipping super_load due to CTR_FLAG_SYNC will cause
 		 * Skipping super_load due to CTR_FLAG_SYNC will cause
 		 * the array to undergo initialization again as
 		 * the array to undergo initialization again as
-		 * though it were new.  This is the intended effect
+		 * though it were new.	This is the intended effect
 		 * of the "sync" directive.
 		 * of the "sync" directive.
 		 *
 		 *
 		 * When reshaping capability is added, we must ensure
 		 * When reshaping capability is added, we must ensure
 		 * that the "sync" directive is disallowed during the
 		 * that the "sync" directive is disallowed during the
 		 * reshape.
 		 * reshape.
 		 */
 		 */
-		rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
-
-		if (rs->ctr_flags & CTR_FLAG_SYNC)
+		if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
 			continue;
 			continue;
 
 
 		if (!rdev->meta_bdev)
 		if (!rdev->meta_bdev)
 			continue;
 			continue;
 
 
-		ret = super_load(rdev, freshest);
+		r = super_load(rdev, freshest);
 
 
-		switch (ret) {
+		switch (r) {
 		case 1:
 		case 1:
 			freshest = rdev;
 			freshest = rdev;
 			break;
 			break;
@@ -1116,57 +2344,368 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 			if (rdev->sb_page)
 			if (rdev->sb_page)
 				put_page(rdev->sb_page);
 				put_page(rdev->sb_page);
 
 
-			rdev->sb_page = NULL;
+			rdev->sb_page = NULL;
+
+			rdev->sb_loaded = 0;
+
+			/*
+			 * We might be able to salvage the data device
+			 * even though the meta device has failed.  For
+			 * now, we behave as though '- -' had been
+			 * set for this device in the table.
+			 */
+			if (dev->data_dev)
+				dm_put_device(ti, dev->data_dev);
+
+			dev->data_dev = NULL;
+			rdev->bdev = NULL;
+
+			list_del(&rdev->same_set);
+		}
+	}
+
+	if (!freshest)
+		return 0;
+
+	if (validate_raid_redundancy(rs)) {
+		rs->ti->error = "Insufficient redundancy to activate array";
+		return -EINVAL;
+	}
+
+	/*
+	 * Validation of the freshest device provides the source of
+	 * validation for the remaining devices.
+	 */
+	rs->ti->error = "Unable to assemble array: Invalid superblocks";
+	if (super_validate(rs, freshest))
+		return -EINVAL;
+
+	rdev_for_each(rdev, mddev)
+		if ((rdev != freshest) && super_validate(rs, rdev))
+			return -EINVAL;
+	return 0;
+}
+
+/*
+ * Adjust data_offset and new_data_offset on all disk members of @rs
+ * for out of place reshaping if requested by constructor
+ *
+ * We need free space at the beginning of each raid disk for forward
+ * and at the end for backward reshapes which userspace has to provide
+ * via remapping/reordering of space.
+ */
+static int rs_adjust_data_offsets(struct raid_set *rs)
+{
+	sector_t data_offset = 0, new_data_offset = 0;
+	struct md_rdev *rdev;
+
+	/* Constructor did not request data offset change */
+	if (!test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
+		if (!rs_is_reshapable(rs))
+			goto out;
+
+		return 0;
+	}
+
+	/* HM FIXME: get InSync raid_dev? */
+	rdev = &rs->dev[0].rdev;
+
+	if (rs->delta_disks < 0) {
+		/*
+		 * Removing disks (reshaping backwards):
+		 *
+		 * - before reshape: data is at offset 0 and free space
+		 *		     is at end of each component LV
+		 *
+		 * - after reshape: data is at offset rs->data_offset != 0 on each component LV
+		 */
+		data_offset = 0;
+		new_data_offset = rs->data_offset;
+
+	} else if (rs->delta_disks > 0) {
+		/*
+		 * Adding disks (reshaping forwards):
+		 *
+		 * - before reshape: data is at offset rs->data_offset != 0 and
+		 *		     free space is at begin of each component LV
+		 *
+		 * - after reshape: data is at offset 0 on each component LV
+		 */
+		data_offset = rs->data_offset;
+		new_data_offset = 0;
+
+	} else {
+		/*
+		 * User space passes in 0 for data offset after having removed reshape space
+		 *
+		 * - or - (data offset != 0)
+		 *
+		 * Changing RAID layout or chunk size -> toggle offsets
+		 *
+		 * - before reshape: data is at offset rs->data_offset 0 and
+		 *		     free space is at end of each component LV
+		 *		     -or-
+		 *                   data is at offset rs->data_offset != 0 and
+		 *		     free space is at begin of each component LV
+		 *
+		 * - after reshape: data is at offset 0 if it was at offset != 0
+		 *                  or at offset != 0 if it was at offset 0
+		 *                  on each component LV
+		 *
+		 */
+		data_offset = rs->data_offset ? rdev->data_offset : 0;
+		new_data_offset = data_offset ? 0 : rs->data_offset;
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+	}
+
+	/*
+	 * Make sure we got a minimum amount of free sectors per device
+	 */
+	if (rs->data_offset &&
+	    to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) {
+		rs->ti->error = data_offset ? "No space for forward reshape" :
+					      "No space for backward reshape";
+		return -ENOSPC;
+	}
+out:
+	/* Adjust data offsets on all rdevs */
+	rdev_for_each(rdev, &rs->md) {
+		rdev->data_offset = data_offset;
+		rdev->new_data_offset = new_data_offset;
+	}
+
+	return 0;
+}
+
+/* Userspace reordered disks -> adjust raid_disk indexes in @rs */
+static void __reorder_raid_disk_indexes(struct raid_set *rs)
+{
+	int i = 0;
+	struct md_rdev *rdev;
+
+	rdev_for_each(rdev, &rs->md) {
+		rdev->raid_disk = i++;
+		rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+	}
+}
+
+/*
+ * Setup @rs for takeover by a different raid level
+ */
+static int rs_setup_takeover(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
+	unsigned int d = mddev->raid_disks = rs->raid_disks;
+	sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset;
+
+	if (rt_is_raid10(rs->raid_type)) {
+		if (mddev->level == 0) {
+			/* Userspace reordered disks -> adjust raid_disk indexes */
+			__reorder_raid_disk_indexes(rs);
+
+			/* raid0 -> raid10_far layout */
+			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR,
+								   rs->raid10_copies);
+		} else if (mddev->level == 1)
+			/* raid1 -> raid10_near layout */
+			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
+								   rs->raid_disks);
+		else
+			return -EINVAL;
+
+	}
+
+	clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+	mddev->recovery_cp = MaxSector;
+
+	while (d--) {
+		rdev = &rs->dev[d].rdev;
+
+		if (test_bit(d, (void *) rs->rebuild_disks)) {
+			clear_bit(In_sync, &rdev->flags);
+			clear_bit(Faulty, &rdev->flags);
+			mddev->recovery_cp = rdev->recovery_offset = 0;
+			/* Bitmap has to be created when we do an "up" takeover */
+			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+		}
+
+		rdev->new_data_offset = new_data_offset;
+	}
+
+	return 0;
+}
+
+/* Prepare @rs for reshape */
+static int rs_prepare_reshape(struct raid_set *rs)
+{
+	bool reshape;
+	struct mddev *mddev = &rs->md;
+
+	if (rs_is_raid10(rs)) {
+		if (rs->raid_disks != mddev->raid_disks &&
+		    __is_raid10_near(mddev->layout) &&
+		    rs->raid10_copies &&
+		    rs->raid10_copies != __raid10_near_copies(mddev->layout)) {
+			/*
+			 * raid disks have to be a multiple of data copies to allow this conversion,
+			 *
+			 * This is actually not a reshape it is a
+			 * rebuild of any additional mirrors per group
+			 */
+			if (rs->raid_disks % rs->raid10_copies) {
+				rs->ti->error = "Can't reshape raid10 mirror groups";
+				return -EINVAL;
+			}
+
+			/* Userspace reordered disks to add/remove mirrors -> adjust raid_disk indexes */
+			__reorder_raid_disk_indexes(rs);
+			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
+								   rs->raid10_copies);
+			mddev->new_layout = mddev->layout;
+			reshape = false;
+		} else
+			reshape = true;
+
+	} else if (rs_is_raid456(rs))
+		reshape = true;
+
+	else if (rs_is_raid1(rs)) {
+		if (rs->delta_disks) {
+			/* Process raid1 via delta_disks */
+			mddev->degraded = rs->delta_disks < 0 ? -rs->delta_disks : rs->delta_disks;
+			reshape = true;
+		} else {
+			/* Process raid1 without delta_disks */
+			mddev->raid_disks = rs->raid_disks;
+			set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
+			reshape = false;
+		}
+	} else {
+		rs->ti->error = "Called with bogus raid type";
+		return -EINVAL;
+	}
+
+	if (reshape) {
+		set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
+	} else if (mddev->raid_disks < rs->raid_disks)
+		/* Create new superblocks and bitmaps, if any new disks */
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+
+	return 0;
+}
+
+/*
+ *
+ * - change raid layout
+ * - change chunk size
+ * - add disks
+ * - remove disks
+ */
+static int rs_setup_reshape(struct raid_set *rs)
+{
+	int r = 0;
+	unsigned int cur_raid_devs, d;
+	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
+
+	mddev->delta_disks = rs->delta_disks;
+	cur_raid_devs = mddev->raid_disks;
+
+	/* Ignore impossible layout change whilst adding/removing disks */
+	if (mddev->delta_disks &&
+	    mddev->layout != mddev->new_layout) {
+		DMINFO("Ignoring invalid layout change with delta_disks=%d", rs->delta_disks);
+		mddev->new_layout = mddev->layout;
+	}
+
+	/*
+	 * Adjust array size:
+	 *
+	 * - in case of adding disks, array size has
+	 *   to grow after the disk adding reshape,
+	 *   which'll happen in the event handler;
+	 *   reshape will happen forward, so space has to
+	 *   be available at the beginning of each disk
+	 *
+	 * - in case of removing disks, array size
+	 *   has to shrink before starting the reshape,
+	 *   which'll happen here;
+	 *   reshape will happen backward, so space has to
+	 *   be available at the end of each disk
+	 *
+	 * - data_offset and new_data_offset are
+	 *   adjusted for aforementioned out of place
+	 *   reshaping based on userspace passing in
+	 *   the "data_offset <sectors>" key/value
+	 *   pair via the constructor
+	 */
 
 
-			rdev->sb_loaded = 0;
+	/* Add disk(s) */
+	if (rs->delta_disks > 0) {
+		/* Prepare disks for check in raid4/5/6/10 {check|start}_reshape */
+		for (d = cur_raid_devs; d < rs->raid_disks; d++) {
+			rdev = &rs->dev[d].rdev;
+			clear_bit(In_sync, &rdev->flags);
 
 
 			/*
 			/*
-			 * We might be able to salvage the data device
-			 * even though the meta device has failed.  For
-			 * now, we behave as though '- -' had been
-			 * set for this device in the table.
+			 * saved_raid_disk needs to be -1, or recovery_offset will be set to 0
+			 * by md, which'll store that erroneously in the superblock on reshape
 			 */
 			 */
-			if (dev->data_dev)
-				dm_put_device(ti, dev->data_dev);
-
-			dev->data_dev = NULL;
-			rdev->bdev = NULL;
+			rdev->saved_raid_disk = -1;
+			rdev->raid_disk = d;
 
 
-			list_del(&rdev->same_set);
+			rdev->sectors = mddev->dev_sectors;
+			rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector;
 		}
 		}
-	}
 
 
-	if (!freshest)
-		return 0;
-
-	if (validate_raid_redundancy(rs)) {
-		rs->ti->error = "Insufficient redundancy to activate array";
-		return -EINVAL;
-	}
+		mddev->reshape_backwards = 0; /* adding disks -> forward reshape */
 
 
-	/*
-	 * Validation of the freshest device provides the source of
-	 * validation for the remaining devices.
-	 */
-	ti->error = "Unable to assemble array: Invalid superblocks";
-	if (super_validate(rs, freshest))
-		return -EINVAL;
+	/* Remove disk(s) */
+	} else if (rs->delta_disks < 0) {
+		r = rs_set_dev_and_array_sectors(rs, true);
+		mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */
 
 
-	rdev_for_each(rdev, mddev)
-		if ((rdev != freshest) && super_validate(rs, rdev))
-			return -EINVAL;
+	/* Change layout and/or chunk size */
+	} else {
+		/*
+		 * Reshape layout (e.g. raid5_ls -> raid5_n) and/or chunk size:
+		 *
+		 * keeping number of disks and do layout change ->
+		 *
+		 * toggle reshape_backward depending on data_offset:
+		 *
+		 * - free space upfront -> reshape forward
+		 *
+		 * - free space at the end -> reshape backward
+		 *
+		 *
+		 * This utilizes free reshape space avoiding the need
+		 * for userspace to move (parts of) LV segments in
+		 * case of layout/chunksize change  (for disk
+		 * adding/removing reshape space has to be at
+		 * the proper address (see above with delta_disks):
+		 *
+		 * add disk(s)   -> begin
+		 * remove disk(s)-> end
+		 */
+		mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1;
+	}
 
 
-	return 0;
+	return r;
 }
 }
 
 
 /*
 /*
  * Enable/disable discard support on RAID set depending on
  * Enable/disable discard support on RAID set depending on
  * RAID level and discard properties of underlying RAID members.
  * RAID level and discard properties of underlying RAID members.
  */
  */
-static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
+static void configure_discard_support(struct raid_set *rs)
 {
 {
 	int i;
 	int i;
 	bool raid456;
 	bool raid456;
+	struct dm_target *ti = rs->ti;
 
 
 	/* Assume discards not supported until after checks below. */
 	/* Assume discards not supported until after checks below. */
 	ti->discards_supported = false;
 	ti->discards_supported = false;
@@ -1174,7 +2713,7 @@ static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
 	/* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
 	/* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
 	raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
 	raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
 
 
-	for (i = 0; i < rs->md.raid_disks; i++) {
+	for (i = 0; i < rs->raid_disks; i++) {
 		struct request_queue *q;
 		struct request_queue *q;
 
 
 		if (!rs->dev[i].rdev.bdev)
 		if (!rs->dev[i].rdev.bdev)
@@ -1207,118 +2746,252 @@ static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
 }
 }
 
 
 /*
 /*
- * Construct a RAID4/5/6 mapping:
+ * Construct a RAID0/1/10/4/5/6 mapping:
  * Args:
  * Args:
- *	<raid_type> <#raid_params> <raid_params>		\
- *	<#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
+ *	<raid_type> <#raid_params> <raid_params>{0,}	\
+ *	<#raid_devs> [<meta_dev1> <dev1>]{1,}
  *
  *
- * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
+ * <raid_params> varies by <raid_type>.	 See 'parse_raid_params' for
  * details on possible <raid_params>.
  * details on possible <raid_params>.
+ *
+ * Userspace is free to initialize the metadata devices, hence the superblocks to
+ * enforce recreation based on the passed in table parameters.
+ *
  */
  */
-static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 {
-	int ret;
+	int r;
+	bool resize;
 	struct raid_type *rt;
 	struct raid_type *rt;
-	unsigned long num_raid_params, num_raid_devs;
+	unsigned int num_raid_params, num_raid_devs;
+	sector_t calculated_dev_sectors;
 	struct raid_set *rs = NULL;
 	struct raid_set *rs = NULL;
-
-	/* Must have at least <raid_type> <#raid_params> */
-	if (argc < 2) {
-		ti->error = "Too few arguments";
+	const char *arg;
+	struct rs_layout rs_layout;
+	struct dm_arg_set as = { argc, argv }, as_nrd;
+	struct dm_arg _args[] = {
+		{ 0, as.argc, "Cannot understand number of raid parameters" },
+		{ 1, 254, "Cannot understand number of raid devices parameters" }
+	};
+
+	/* Must have <raid_type> */
+	arg = dm_shift_arg(&as);
+	if (!arg) {
+		ti->error = "No arguments";
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	/* raid type */
-	rt = get_raid_type(argv[0]);
+	rt = get_raid_type(arg);
 	if (!rt) {
 	if (!rt) {
 		ti->error = "Unrecognised raid_type";
 		ti->error = "Unrecognised raid_type";
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
-	argc--;
-	argv++;
 
 
-	/* number of RAID parameters */
-	if (kstrtoul(argv[0], 10, &num_raid_params) < 0) {
-		ti->error = "Cannot understand number of RAID parameters";
+	/* Must have <#raid_params> */
+	if (dm_read_arg_group(_args, &as, &num_raid_params, &ti->error))
 		return -EINVAL;
 		return -EINVAL;
-	}
-	argc--;
-	argv++;
-
-	/* Skip over RAID params for now and find out # of devices */
-	if (num_raid_params >= argc) {
-		ti->error = "Arguments do not agree with counts given";
-		return -EINVAL;
-	}
 
 
-	if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
-	    (num_raid_devs > MAX_RAID_DEVICES)) {
-		ti->error = "Cannot understand number of raid devices";
+	/* number of raid device tuples <meta_dev data_dev> */
+	as_nrd = as;
+	dm_consume_args(&as_nrd, num_raid_params);
+	_args[1].max = (as_nrd.argc - 1) / 2;
+	if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error))
 		return -EINVAL;
 		return -EINVAL;
-	}
 
 
-	argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
-	if (argc != (num_raid_devs * 2)) {
-		ti->error = "Supplied RAID devices does not match the count given";
+	if (!__within_range(num_raid_devs, 1, MAX_RAID_DEVICES)) {
+		ti->error = "Invalid number of supplied raid devices";
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
+	rs = raid_set_alloc(ti, rt, num_raid_devs);
 	if (IS_ERR(rs))
 	if (IS_ERR(rs))
 		return PTR_ERR(rs);
 		return PTR_ERR(rs);
 
 
-	ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
-	if (ret)
+	r = parse_raid_params(rs, &as, num_raid_params);
+	if (r)
 		goto bad;
 		goto bad;
 
 
-	argv += num_raid_params + 1;
-
-	ret = dev_parms(rs, argv);
-	if (ret)
+	r = parse_dev_params(rs, &as);
+	if (r)
 		goto bad;
 		goto bad;
 
 
 	rs->md.sync_super = super_sync;
 	rs->md.sync_super = super_sync;
-	ret = analyse_superblocks(ti, rs);
-	if (ret)
+
+	/*
+	 * Calculate ctr requested array and device sizes to allow
+	 * for superblock analysis needing device sizes defined.
+	 *
+	 * Any existing superblock will overwrite the array and device sizes
+	 */
+	r = rs_set_dev_and_array_sectors(rs, false);
+	if (r)
+		goto bad;
+
+	calculated_dev_sectors = rs->dev[0].rdev.sectors;
+
+	/*
+	 * Backup any new raid set level, layout, ...
+	 * requested to be able to compare to superblock
+	 * members for conversion decisions.
+	 */
+	rs_config_backup(rs, &rs_layout);
+
+	r = analyse_superblocks(ti, rs);
+	if (r)
 		goto bad;
 		goto bad;
 
 
+	resize = calculated_dev_sectors != rs->dev[0].rdev.sectors;
+
 	INIT_WORK(&rs->md.event_work, do_table_event);
 	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->private = rs;
 	ti->private = rs;
 	ti->num_flush_bios = 1;
 	ti->num_flush_bios = 1;
 
 
+	/* Restore any requested new layout for conversion decision */
+	rs_config_restore(rs, &rs_layout);
+
 	/*
 	/*
-	 * Disable/enable discard support on RAID set.
+	 * Now that we have any superblock metadata available,
+	 * check for new, recovering, reshaping, to be taken over,
+	 * to be reshaped or an existing, unchanged raid set to
+	 * run in sequence.
 	 */
 	 */
-	configure_discard_support(ti, rs);
+	if (test_bit(MD_ARRAY_FIRST_USE, &rs->md.flags)) {
+		/* A new raid6 set has to be recovered to ensure proper parity and Q-Syndrome */
+		if (rs_is_raid6(rs) &&
+		    test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
+			ti->error = "'nosync' not allowed for new raid6 set";
+			r = -EINVAL;
+			goto bad;
+		}
+		rs_setup_recovery(rs, 0);
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		rs_set_new(rs);
+	} else if (rs_is_recovering(rs)) {
+		/* A recovering raid set may be resized */
+		; /* skip setup rs */
+	} else if (rs_is_reshaping(rs)) {
+		/* Have to reject size change request during reshape */
+		if (resize) {
+			ti->error = "Can't resize a reshaping raid set";
+			r = -EPERM;
+			goto bad;
+		}
+		/* skip setup rs */
+	} else if (rs_takeover_requested(rs)) {
+		if (rs_is_reshaping(rs)) {
+			ti->error = "Can't takeover a reshaping raid set";
+			r = -EPERM;
+			goto bad;
+		}
+
+		/*
+		 * If a takeover is needed, userspace sets any additional
+		 * devices to rebuild and we can check for a valid request here.
+		 *
+		 * If acceptable, set the level to the new requested
+		 * one, prohibit requesting recovery, allow the raid
+		 * set to run and store superblocks during resume.
+		 */
+		r = rs_check_takeover(rs);
+		if (r)
+			goto bad;
+
+		r = rs_setup_takeover(rs);
+		if (r)
+			goto bad;
+
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
+		/* Takeover ain't recovery, so disable recovery */
+		rs_setup_recovery(rs, MaxSector);
+		rs_set_new(rs);
+	} else if (rs_reshape_requested(rs)) {
+		/*
+		  * We can only prepare for a reshape here, because the
+		  * raid set needs to run to provide the respective reshape
+		  * check functions via its MD personality instance.
+		  *
+		  * So do the reshape check after md_run() succeeded.
+		  */
+		r = rs_prepare_reshape(rs);
+		if (r)
+			return r;
+
+		/* Reshaping ain't recovery, so disable recovery */
+		rs_setup_recovery(rs, MaxSector);
+		rs_set_cur(rs);
+	} else {
+		/* May not set recovery when a device rebuild is requested */
+		if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
+			rs_setup_recovery(rs, MaxSector);
+			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		} else
+			rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ?
+					      0 : (resize ? calculated_dev_sectors : MaxSector));
+		rs_set_cur(rs);
+	}
+
+	/* If constructor requested it, change data and new_data offsets */
+	r = rs_adjust_data_offsets(rs);
+	if (r)
+		goto bad;
+
+	/* Start raid set read-only and assumed clean to change in raid_resume() */
+	rs->md.ro = 1;
+	rs->md.in_sync = 1;
+	set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 
 
 	/* Has to be held on running the array */
 	/* Has to be held on running the array */
 	mddev_lock_nointr(&rs->md);
 	mddev_lock_nointr(&rs->md);
-	ret = md_run(&rs->md);
+	r = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
 	rs->md.in_sync = 0; /* Assume already marked dirty */
-	mddev_unlock(&rs->md);
 
 
-	if (ret) {
-		ti->error = "Fail to run raid array";
+	if (r) {
+		ti->error = "Failed to run raid array";
+		mddev_unlock(&rs->md);
 		goto bad;
 		goto bad;
 	}
 	}
 
 
-	if (ti->len != rs->md.array_sectors) {
-		ti->error = "Array size does not match requested target length";
-		ret = -EINVAL;
-		goto size_mismatch;
-	}
 	rs->callbacks.congested_fn = raid_is_congested;
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
 
 	mddev_suspend(&rs->md);
 	mddev_suspend(&rs->md);
+
+	/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
+	if (rs_is_raid456(rs)) {
+		r = rs_set_raid456_stripe_cache(rs);
+		if (r)
+			goto bad_stripe_cache;
+	}
+
+	/* Now do an early reshape check */
+	if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
+		r = rs_check_reshape(rs);
+		if (r)
+			goto bad_check_reshape;
+
+		/* Restore new, ctr requested layout to perform check */
+		rs_config_restore(rs, &rs_layout);
+
+		if (rs->md.pers->start_reshape) {
+			r = rs->md.pers->check_reshape(&rs->md);
+			if (r) {
+				ti->error = "Reshape check failed";
+				goto bad_check_reshape;
+			}
+		}
+	}
+
+	mddev_unlock(&rs->md);
 	return 0;
 	return 0;
 
 
-size_mismatch:
+bad_stripe_cache:
+bad_check_reshape:
 	md_stop(&rs->md);
 	md_stop(&rs->md);
 bad:
 bad:
-	context_free(rs);
+	raid_set_free(rs);
 
 
-	return ret;
+	return r;
 }
 }
 
 
 static void raid_dtr(struct dm_target *ti)
 static void raid_dtr(struct dm_target *ti)
@@ -1327,7 +3000,7 @@ static void raid_dtr(struct dm_target *ti)
 
 
 	list_del_init(&rs->callbacks.list);
 	list_del_init(&rs->callbacks.list);
 	md_stop(&rs->md);
 	md_stop(&rs->md);
-	context_free(rs);
+	raid_set_free(rs);
 }
 }
 
 
 static int raid_map(struct dm_target *ti, struct bio *bio)
 static int raid_map(struct dm_target *ti, struct bio *bio)
@@ -1335,11 +3008,23 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
 	struct raid_set *rs = ti->private;
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
 	struct mddev *mddev = &rs->md;
 
 
+	/*
+	 * If we're reshaping to add disk(s), ti->len and
+	 * mddev->array_sectors will differ during the process
+	 * (ti->len > mddev->array_sectors), so we have to requeue
+	 * bios with addresses > mddev->array_sectors here or
+	 * there will occur accesses past EOD of the component
+	 * data images thus erroring the raid set.
+	 */
+	if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
+		return DM_MAPIO_REQUEUE;
+
 	mddev->pers->make_request(mddev, bio);
 	mddev->pers->make_request(mddev, bio);
 
 
 	return DM_MAPIO_SUBMITTED;
 	return DM_MAPIO_SUBMITTED;
 }
 }
 
 
+/* Return string describing the current sync action of @mddev */
 static const char *decipher_sync_action(struct mddev *mddev)
 static const char *decipher_sync_action(struct mddev *mddev)
 {
 {
 	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
 	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
@@ -1365,195 +3050,260 @@ static const char *decipher_sync_action(struct mddev *mddev)
 	return "idle";
 	return "idle";
 }
 }
 
 
-static void raid_status(struct dm_target *ti, status_type_t type,
-			unsigned status_flags, char *result, unsigned maxlen)
+/*
+ * Return status string for @rdev
+ *
+ * Status characters:
+ *
+ *  'D' = Dead/Failed device
+ *  'a' = Alive but not in-sync
+ *  'A' = Alive and in-sync
+ */
+static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
 {
 {
-	struct raid_set *rs = ti->private;
-	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
-	unsigned sz = 0;
-	int i, array_in_sync = 0;
-	sector_t sync;
+	if (test_bit(Faulty, &rdev->flags))
+		return "D";
+	else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
+		return "a";
+	else
+		return "A";
+}
 
 
-	switch (type) {
-	case STATUSTYPE_INFO:
-		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
+/* Helper to return resync/reshape progress for @rs and @array_in_sync */
+static sector_t rs_get_progress(struct raid_set *rs,
+				sector_t resync_max_sectors, bool *array_in_sync)
+{
+	sector_t r, recovery_cp, curr_resync_completed;
+	struct mddev *mddev = &rs->md;
 
 
-		if (rs->raid_type->level) {
-			if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
-				sync = rs->md.curr_resync_completed;
-			else
-				sync = rs->md.recovery_cp;
-
-			if (sync >= rs->md.resync_max_sectors) {
-				/*
-				 * Sync complete.
-				 */
-				array_in_sync = 1;
-				sync = rs->md.resync_max_sectors;
-			} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
-				/*
-				 * If "check" or "repair" is occurring, the array has
-				 * undergone and initial sync and the health characters
-				 * should not be 'a' anymore.
-				 */
-				array_in_sync = 1;
+	curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp;
+	recovery_cp = mddev->recovery_cp;
+	*array_in_sync = false;
+
+	if (rs_is_raid0(rs)) {
+		r = resync_max_sectors;
+		*array_in_sync = true;
+
+	} else {
+		r = mddev->reshape_position;
+
+		/* Reshape is relative to the array size */
+		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
+		    r != MaxSector) {
+			if (r == MaxSector) {
+				*array_in_sync = true;
+				r = resync_max_sectors;
 			} else {
 			} else {
-				/*
-				 * The array may be doing an initial sync, or it may
-				 * be rebuilding individual components.  If all the
-				 * devices are In_sync, then it is the array that is
-				 * being initialized.
-				 */
-				for (i = 0; i < rs->md.raid_disks; i++)
-					if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
-						array_in_sync = 1;
+				/* Got to reverse on backward reshape */
+				if (mddev->reshape_backwards)
+					r = mddev->array_sectors - r;
+
+				/* Divide by # of data stripes */
+				sector_div(r, mddev_data_stripes(rs));
 			}
 			}
+
+		/* Sync is relative to the component device size */
+		} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+			r = curr_resync_completed;
+		else
+			r = recovery_cp;
+
+		if (r == MaxSector) {
+			/*
+			 * Sync complete.
+			 */
+			*array_in_sync = true;
+			r = resync_max_sectors;
+		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+			/*
+			 * If "check" or "repair" is occurring, the raid set has
+			 * undergone an initial sync and the health characters
+			 * should not be 'a' anymore.
+			 */
+			*array_in_sync = true;
 		} else {
 		} else {
-			/* RAID0 */
-			array_in_sync = 1;
-			sync = rs->md.resync_max_sectors;
-		}
+			struct md_rdev *rdev;
 
 
-		/*
-		 * Status characters:
-		 *  'D' = Dead/Failed device
-		 *  'a' = Alive but not in-sync
-		 *  'A' = Alive and in-sync
-		 */
-		for (i = 0; i < rs->md.raid_disks; i++) {
-			if (test_bit(Faulty, &rs->dev[i].rdev.flags))
-				DMEMIT("D");
-			else if (!array_in_sync ||
-				 !test_bit(In_sync, &rs->dev[i].rdev.flags))
-				DMEMIT("a");
-			else
-				DMEMIT("A");
+			/*
+			 * The raid set may be doing an initial sync, or it may
+			 * be rebuilding individual components.	 If all the
+			 * devices are In_sync, then it is the raid set that is
+			 * being initialized.
+			 */
+			rdev_for_each(rdev, mddev)
+				if (!test_bit(In_sync, &rdev->flags))
+					*array_in_sync = true;
+#if 0
+			r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
+#endif
 		}
 		}
+	}
+
+	return r;
+}
+
+/* Helper to return @dev name or "-" if !@dev */
+static const char *__get_dev_name(struct dm_dev *dev)
+{
+	return dev ? dev->name : "-";
+}
+
+static void raid_status(struct dm_target *ti, status_type_t type,
+			unsigned int status_flags, char *result, unsigned int maxlen)
+{
+	struct raid_set *rs = ti->private;
+	struct mddev *mddev = &rs->md;
+	struct r5conf *conf = mddev->private;
+	int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0;
+	bool array_in_sync;
+	unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
+	unsigned int sz = 0;
+	unsigned int rebuild_disks;
+	unsigned int write_mostly_params = 0;
+	sector_t progress, resync_max_sectors, resync_mismatches;
+	const char *sync_action;
+	struct raid_type *rt;
+	struct md_rdev *rdev;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		/* *Should* always succeed */
+		rt = get_raid_type_by_ll(mddev->new_level, mddev->new_layout);
+		if (!rt)
+			return;
+
+		DMEMIT("%s %d ", rt->name, mddev->raid_disks);
+
+		/* Access most recent mddev properties for status output */
+		smp_rmb();
+		/* Get sensible max sectors even if raid set not yet started */
+		resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ?
+				      mddev->resync_max_sectors : mddev->dev_sectors;
+		progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync);
+		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
+				    atomic64_read(&mddev->resync_mismatches) : 0;
+		sync_action = decipher_sync_action(&rs->md);
+
+		/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
+		rdev_for_each(rdev, mddev)
+			DMEMIT(__raid_dev_status(rdev, array_in_sync));
 
 
 		/*
 		/*
-		 * In-sync ratio:
+		 * In-sync/Reshape ratio:
 		 *  The in-sync ratio shows the progress of:
 		 *  The in-sync ratio shows the progress of:
-		 *   - Initializing the array
-		 *   - Rebuilding a subset of devices of the array
+		 *   - Initializing the raid set
+		 *   - Rebuilding a subset of devices of the raid set
 		 *  The user can distinguish between the two by referring
 		 *  The user can distinguish between the two by referring
 		 *  to the status characters.
 		 *  to the status characters.
+		 *
+		 *  The reshape ratio shows the progress of
+		 *  changing the raid layout or the number of
+		 *  disks of a raid set
 		 */
 		 */
-		DMEMIT(" %llu/%llu",
-		       (unsigned long long) sync,
-		       (unsigned long long) rs->md.resync_max_sectors);
+		DMEMIT(" %llu/%llu", (unsigned long long) progress,
+				     (unsigned long long) resync_max_sectors);
 
 
 		/*
 		/*
+		 * v1.5.0+:
+		 *
 		 * Sync action:
 		 * Sync action:
-		 *   See Documentation/device-mapper/dm-raid.c for
+		 *   See Documentation/device-mapper/dm-raid.txt for
 		 *   information on each of these states.
 		 *   information on each of these states.
 		 */
 		 */
-		DMEMIT(" %s", decipher_sync_action(&rs->md));
+		DMEMIT(" %s", sync_action);
 
 
 		/*
 		/*
+		 * v1.5.0+:
+		 *
 		 * resync_mismatches/mismatch_cnt
 		 * resync_mismatches/mismatch_cnt
 		 *   This field shows the number of discrepancies found when
 		 *   This field shows the number of discrepancies found when
-		 *   performing a "check" of the array.
+		 *   performing a "check" of the raid set.
 		 */
 		 */
-		DMEMIT(" %llu",
-		       (strcmp(rs->md.last_sync_action, "check")) ? 0 :
-		       (unsigned long long)
-		       atomic64_read(&rs->md.resync_mismatches));
-		break;
-	case STATUSTYPE_TABLE:
-		/* The string you would use to construct this array */
-		for (i = 0; i < rs->md.raid_disks; i++) {
-			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
-			    rs->dev[i].data_dev &&
-			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
-				raid_param_cnt += 2; /* for rebuilds */
-			if (rs->dev[i].data_dev &&
-			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
-				raid_param_cnt += 2;
-		}
-
-		raid_param_cnt += (hweight32(rs->ctr_flags & ~CTR_FLAG_REBUILD) * 2);
-		if (rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC))
-			raid_param_cnt--;
-
-		DMEMIT("%s %u %u", rs->raid_type->name,
-		       raid_param_cnt, rs->md.chunk_sectors);
-
-		if ((rs->ctr_flags & CTR_FLAG_SYNC) &&
-		    (rs->md.recovery_cp == MaxSector))
-			DMEMIT(" sync");
-		if (rs->ctr_flags & CTR_FLAG_NOSYNC)
-			DMEMIT(" nosync");
-
-		for (i = 0; i < rs->md.raid_disks; i++)
-			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
-			    rs->dev[i].data_dev &&
-			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
-				DMEMIT(" rebuild %u", i);
-
-		if (rs->ctr_flags & CTR_FLAG_DAEMON_SLEEP)
-			DMEMIT(" daemon_sleep %lu",
-			       rs->md.bitmap_info.daemon_sleep);
-
-		if (rs->ctr_flags & CTR_FLAG_MIN_RECOVERY_RATE)
-			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
+		DMEMIT(" %llu", (unsigned long long) resync_mismatches);
 
 
-		if (rs->ctr_flags & CTR_FLAG_MAX_RECOVERY_RATE)
-			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
-
-		for (i = 0; i < rs->md.raid_disks; i++)
-			if (rs->dev[i].data_dev &&
-			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
-				DMEMIT(" write_mostly %u", i);
-
-		if (rs->ctr_flags & CTR_FLAG_MAX_WRITE_BEHIND)
-			DMEMIT(" max_write_behind %lu",
-			       rs->md.bitmap_info.max_write_behind);
-
-		if (rs->ctr_flags & CTR_FLAG_STRIPE_CACHE) {
-			struct r5conf *conf = rs->md.private;
-
-			/* convert from kiB to sectors */
-			DMEMIT(" stripe_cache %d",
-			       conf ? conf->max_nr_stripes * 2 : 0);
-		}
-
-		if (rs->ctr_flags & CTR_FLAG_REGION_SIZE)
-			DMEMIT(" region_size %lu",
-			       rs->md.bitmap_info.chunksize >> 9);
-
-		if (rs->ctr_flags & CTR_FLAG_RAID10_COPIES)
-			DMEMIT(" raid10_copies %u",
-			       raid10_md_layout_to_copies(rs->md.layout));
-
-		if (rs->ctr_flags & CTR_FLAG_RAID10_FORMAT)
-			DMEMIT(" raid10_format %s",
-			       raid10_md_layout_to_format(rs->md.layout));
-
-		DMEMIT(" %d", rs->md.raid_disks);
-		for (i = 0; i < rs->md.raid_disks; i++) {
-			if (rs->dev[i].meta_dev)
-				DMEMIT(" %s", rs->dev[i].meta_dev->name);
-			else
-				DMEMIT(" -");
+		/*
+		 * v1.9.0+:
+		 *
+		 * data_offset (needed for out of space reshaping)
+		 *   This field shows the data offset into the data
+		 *   image LV where the first stripe's data starts.
+		 *
+		 * We keep data_offset equal on all raid disks of the set,
+		 * so retrieving it from the first raid disk is sufficient.
+		 */
+		DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
+		break;
 
 
-			if (rs->dev[i].data_dev)
-				DMEMIT(" %s", rs->dev[i].data_dev->name);
-			else
-				DMEMIT(" -");
-		}
+	case STATUSTYPE_TABLE:
+		/* Report the table line string you would use to construct this raid set */
+
+		/* Calculate raid parameter count */
+		for (i = 0; i < rs->raid_disks; i++)
+			if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+				write_mostly_params += 2;
+		rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks));
+		raid_param_cnt += rebuild_disks * 2 +
+				  write_mostly_params +
+				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
+				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
+		/* Emit table line */
+		DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
+		if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
+			DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
+					 raid10_md_layout_to_format(mddev->layout));
+		if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
+					 raid10_md_layout_to_copies(mddev->layout));
+		if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
+			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
+		if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
+			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
+		if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
+			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
+					   (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
+		if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
+			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
+					   (unsigned long long) rs->data_offset);
+		if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
+			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
+					  mddev->bitmap_info.daemon_sleep);
+		if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
+					 max(rs->delta_disks, mddev->delta_disks));
+		if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
+					 max_nr_stripes);
+		if (rebuild_disks)
+			for (i = 0; i < rs->raid_disks; i++)
+				if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
+					DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
+							 rs->dev[i].rdev.raid_disk);
+		if (write_mostly_params)
+			for (i = 0; i < rs->raid_disks; i++)
+				if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+					DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
+					       rs->dev[i].rdev.raid_disk);
+		if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
+			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
+					  mddev->bitmap_info.max_write_behind);
+		if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
+					 mddev->sync_speed_max);
+		if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
+					 mddev->sync_speed_min);
+		DMEMIT(" %d", rs->raid_disks);
+		for (i = 0; i < rs->raid_disks; i++)
+			DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
+					 __get_dev_name(rs->dev[i].data_dev));
 	}
 	}
 }
 }
 
 
-static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+static int raid_message(struct dm_target *ti, unsigned int argc, char **argv)
 {
 {
 	struct raid_set *rs = ti->private;
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
 	struct mddev *mddev = &rs->md;
 
 
-	if (!strcasecmp(argv[0], "reshape")) {
-		DMERR("Reshape not supported.");
-		return -EINVAL;
-	}
-
 	if (!mddev->pers || !mddev->pers->sync_request)
 	if (!mddev->pers || !mddev->pers->sync_request)
 		return -EINVAL;
 		return -EINVAL;
 
 
@@ -1571,11 +3321,10 @@ static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
 		return -EBUSY;
 	else if (!strcasecmp(argv[0], "resync"))
 	else if (!strcasecmp(argv[0], "resync"))
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	else if (!strcasecmp(argv[0], "recover")) {
+		; /* MD_RECOVERY_NEEDED set below */
+	else if (!strcasecmp(argv[0], "recover"))
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	} else {
+	else {
 		if (!strcasecmp(argv[0], "check"))
 		if (!strcasecmp(argv[0], "check"))
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		else if (!!strcasecmp(argv[0], "repair"))
 		else if (!!strcasecmp(argv[0], "repair"))
@@ -1588,11 +3337,11 @@ static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
 		 * canceling read-auto mode
 		 * canceling read-auto mode
 		 */
 		 */
 		mddev->ro = 0;
 		mddev->ro = 0;
-		if (!mddev->suspended)
+		if (!mddev->suspended && mddev->sync_thread)
 			md_wakeup_thread(mddev->sync_thread);
 			md_wakeup_thread(mddev->sync_thread);
 	}
 	}
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	if (!mddev->suspended)
+	if (!mddev->suspended && mddev->thread)
 		md_wakeup_thread(mddev->thread);
 		md_wakeup_thread(mddev->thread);
 
 
 	return 0;
 	return 0;
@@ -1602,28 +3351,27 @@ static int raid_iterate_devices(struct dm_target *ti,
 				iterate_devices_callout_fn fn, void *data)
 				iterate_devices_callout_fn fn, void *data)
 {
 {
 	struct raid_set *rs = ti->private;
 	struct raid_set *rs = ti->private;
-	unsigned i;
-	int ret = 0;
+	unsigned int i;
+	int r = 0;
 
 
-	for (i = 0; !ret && i < rs->md.raid_disks; i++)
+	for (i = 0; !r && i < rs->md.raid_disks; i++)
 		if (rs->dev[i].data_dev)
 		if (rs->dev[i].data_dev)
-			ret = fn(ti,
+			r = fn(ti,
 				 rs->dev[i].data_dev,
 				 rs->dev[i].data_dev,
 				 0, /* No offset on data devs */
 				 0, /* No offset on data devs */
 				 rs->md.dev_sectors,
 				 rs->md.dev_sectors,
 				 data);
 				 data);
 
 
-	return ret;
+	return r;
 }
 }
 
 
 static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 {
 	struct raid_set *rs = ti->private;
 	struct raid_set *rs = ti->private;
-	unsigned chunk_size = rs->md.chunk_sectors << 9;
-	struct r5conf *conf = rs->md.private;
+	unsigned int chunk_size = to_bytes(rs->md.chunk_sectors);
 
 
 	blk_limits_io_min(limits, chunk_size);
 	blk_limits_io_min(limits, chunk_size);
-	blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
+	blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
 }
 }
 
 
 static void raid_presuspend(struct dm_target *ti)
 static void raid_presuspend(struct dm_target *ti)
@@ -1637,7 +3385,11 @@ static void raid_postsuspend(struct dm_target *ti)
 {
 {
 	struct raid_set *rs = ti->private;
 	struct raid_set *rs = ti->private;
 
 
-	mddev_suspend(&rs->md);
+	if (test_and_clear_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
+		if (!rs->md.suspended)
+			mddev_suspend(&rs->md);
+		rs->md.ro = 1;
+	}
 }
 }
 
 
 static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 static void attempt_restore_of_faulty_devices(struct raid_set *rs)
@@ -1651,8 +3403,8 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 	for (i = 0; i < rs->md.raid_disks; i++) {
 	for (i = 0; i < rs->md.raid_disks; i++) {
 		r = &rs->dev[i].rdev;
 		r = &rs->dev[i].rdev;
 		if (test_bit(Faulty, &r->flags) && r->sb_page &&
 		if (test_bit(Faulty, &r->flags) && r->sb_page &&
-		    sync_page_io(r, 0, r->sb_size, r->sb_page, REQ_OP_READ, 0,
-				 1)) {
+		    sync_page_io(r, 0, r->sb_size, r->sb_page,
+				 REQ_OP_READ, 0, true)) {
 			DMINFO("Faulty %s device #%d has readable super block."
 			DMINFO("Faulty %s device #%d has readable super block."
 			       "  Attempting to revive it.",
 			       "  Attempting to revive it.",
 			       rs->raid_type->name, i);
 			       rs->raid_type->name, i);
@@ -1661,7 +3413,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 			 * Faulty bit may be set, but sometimes the array can
 			 * Faulty bit may be set, but sometimes the array can
 			 * be suspended before the personalities can respond
 			 * be suspended before the personalities can respond
 			 * by removing the device from the array (i.e. calling
 			 * by removing the device from the array (i.e. calling
-			 * 'hot_remove_disk').  If they haven't yet removed
+			 * 'hot_remove_disk').	If they haven't yet removed
 			 * the failed device, its 'raid_disk' number will be
 			 * the failed device, its 'raid_disk' number will be
 			 * '>= 0' - meaning we must call this function
 			 * '>= 0' - meaning we must call this function
 			 * ourselves.
 			 * ourselves.
@@ -1697,34 +3449,192 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 	}
 	}
 }
 }
 
 
-static void raid_resume(struct dm_target *ti)
+static int __load_dirty_region_bitmap(struct raid_set *rs)
 {
 {
-	struct raid_set *rs = ti->private;
+	int r = 0;
+
+	/* Try loading the bitmap unless "raid0", which does not have one */
+	if (!rs_is_raid0(rs) &&
+	    !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
+		r = bitmap_load(&rs->md);
+		if (r)
+			DMERR("Failed to load bitmap");
+	}
 
 
-	if (rs->raid_type->level) {
-		set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+	return r;
+}
 
 
-		if (!rs->bitmap_loaded) {
-			bitmap_load(&rs->md);
-			rs->bitmap_loaded = 1;
-		} else {
-			/*
-			 * A secondary resume while the device is active.
-			 * Take this opportunity to check whether any failed
-			 * devices are reachable again.
-			 */
-			attempt_restore_of_faulty_devices(rs);
+/* Enforce updating all superblocks */
+static void rs_update_sbs(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+	int ro = mddev->ro;
+
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	mddev->ro = 0;
+	md_update_sb(mddev, 1);
+	mddev->ro = ro;
+}
+
+/*
+ * Reshape changes raid algorithm of @rs to new one within personality
+ * (e.g. raid6_zr -> raid6_nc), changes stripe size, adds/removes
+ * disks from a raid set thus growing/shrinking it or resizes the set
+ *
+ * Call mddev_lock_nointr() before!
+ */
+static int rs_start_reshape(struct raid_set *rs)
+{
+	int r;
+	struct mddev *mddev = &rs->md;
+	struct md_personality *pers = mddev->pers;
+
+	r = rs_setup_reshape(rs);
+	if (r)
+		return r;
+
+	/* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */
+	if (mddev->suspended)
+		mddev_resume(mddev);
+
+	/*
+	 * Check any reshape constraints enforced by the personality
+	 *
+	 * May as well already kick the reshape off so that pers->start_reshape() becomes optional.
+	 */
+	r = pers->check_reshape(mddev);
+	if (r) {
+		rs->ti->error = "pers->check_reshape() failed";
+		return r;
+	}
+
+	/*
+	 * Personality may not provide start reshape method in which
+	 * case check_reshape above has already covered everything
+	 */
+	if (pers->start_reshape) {
+		r = pers->start_reshape(mddev);
+		if (r) {
+			rs->ti->error = "pers->start_reshape() failed";
+			return r;
 		}
 		}
+	}
+
+	/* Suspend because a resume will happen in raid_resume() */
+	if (!mddev->suspended)
+		mddev_suspend(mddev);
+
+	/*
+	 * Now reshape got set up, update superblocks to
+	 * reflect the fact so that a table reload will
+	 * access proper superblock content in the ctr.
+	 */
+	rs_update_sbs(rs);
+
+	return 0;
+}
+
+static int raid_preresume(struct dm_target *ti)
+{
+	int r;
+	struct raid_set *rs = ti->private;
+	struct mddev *mddev = &rs->md;
+
+	/* This is a resume after a suspend of the set -> it's already started */
+	if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
+		return 0;
+
+	/*
+	 * The superblocks need to be updated on disk if the
+	 * array is new or new devices got added (thus zeroed
+	 * out by userspace); otherwise __load_dirty_region_bitmap
+	 * will overwrite them in core with old data or fail.
+	 */
+	if (test_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags))
+		rs_update_sbs(rs);
+
+	/*
+	 * Disable/enable discard support on raid set after any
+	 * conversion, because devices can have been added
+	 */
+	configure_discard_support(rs);
+
+	/* Load the bitmap from disk unless raid0 */
+	r = __load_dirty_region_bitmap(rs);
+	if (r)
+		return r;
+
+	/* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */
+	if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) &&
+	    mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) {
+		r = bitmap_resize(mddev->bitmap, mddev->dev_sectors,
+				  to_bytes(rs->requested_bitmap_chunk_sectors), 0);
+		if (r)
+			DMERR("Failed to resize bitmap");
+	}
+
+	/* Check for any resize/reshape on @rs and adjust/initiate */
+	/* Be prepared for mddev_resume() in raid_resume() */
+	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+	if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
+		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		mddev->resync_min = mddev->recovery_cp;
+	}
 
 
-		clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
+	rs_set_capacity(rs);
+
+	/* Check for any reshape request unless new raid set */
+	if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
+		/* Initiate a reshape. */
+		mddev_lock_nointr(mddev);
+		r = rs_start_reshape(rs);
+		mddev_unlock(mddev);
+		if (r)
+			DMWARN("Failed to check/start reshape, continuing without change");
+		r = 0;
 	}
 	}
 
 
-	mddev_resume(&rs->md);
+	return r;
+}
+
+static void raid_resume(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	struct mddev *mddev = &rs->md;
+
+	if (test_and_set_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
+		/*
+		 * A secondary resume while the device is active.
+		 * Take this opportunity to check whether any failed
+		 * devices are reachable again.
+		 */
+		attempt_restore_of_faulty_devices(rs);
+	} else {
+		mddev->ro = 0;
+		mddev->in_sync = 0;
+
+		/*
+		 * When passing in flags to the ctr, we expect userspace
+		 * to reset them because they made it to the superblocks
+		 * and reload the mapping anyway.
+		 *
+		 * -> only unfreeze recovery in case of a table reload or
+		 *    we'll have a bogus recovery/reshape position
+		 *    retrieved from the superblock by the ctr because
+		 *    the ongoing recovery/reshape will change it after read.
+		 */
+		if (!test_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags))
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+
+		if (mddev->suspended)
+			mddev_resume(mddev);
+	}
 }
 }
 
 
 static struct target_type raid_target = {
 static struct target_type raid_target = {
 	.name = "raid",
 	.name = "raid",
-	.version = {1, 8, 0},
+	.version = {1, 9, 0},
 	.module = THIS_MODULE,
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
 	.dtr = raid_dtr,
@@ -1735,6 +3645,7 @@ static struct target_type raid_target = {
 	.io_hints = raid_io_hints,
 	.io_hints = raid_io_hints,
 	.presuspend = raid_presuspend,
 	.presuspend = raid_presuspend,
 	.postsuspend = raid_postsuspend,
 	.postsuspend = raid_postsuspend,
+	.preresume = raid_preresume,
 	.resume = raid_resume,
 	.resume = raid_resume,
 };
 };
 
 
@@ -1759,11 +3670,13 @@ module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
 MODULE_PARM_DESC(devices_handle_discard_safely,
 		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
 		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
 
 
-MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_DESCRIPTION(DM_NAME " raid0/1/10/4/5/6 target");
+MODULE_ALIAS("dm-raid0");
 MODULE_ALIAS("dm-raid1");
 MODULE_ALIAS("dm-raid1");
 MODULE_ALIAS("dm-raid10");
 MODULE_ALIAS("dm-raid10");
 MODULE_ALIAS("dm-raid4");
 MODULE_ALIAS("dm-raid4");
 MODULE_ALIAS("dm-raid5");
 MODULE_ALIAS("dm-raid5");
 MODULE_ALIAS("dm-raid6");
 MODULE_ALIAS("dm-raid6");
 MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
 MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
+MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
 MODULE_LICENSE("GPL");

+ 970 - 0
drivers/md/dm-rq.c

@@ -0,0 +1,970 @@
+/*
+ * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-core.h"
+#include "dm-rq.h"
+
+#include <linux/elevator.h> /* for rq_end_sector() */
+#include <linux/blk-mq.h>
+
+#define DM_MSG_PREFIX "core-rq"
+
+#define DM_MQ_NR_HW_QUEUES 1
+#define DM_MQ_QUEUE_DEPTH 2048
+static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
+static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
+
+/*
+ * Request-based DM's mempools' reserved IOs set by the user.
+ */
+#define RESERVED_REQUEST_BASED_IOS	256
+static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
+
+#ifdef CONFIG_DM_MQ_DEFAULT
+static bool use_blk_mq = true;
+#else
+static bool use_blk_mq = false;
+#endif
+
+bool dm_use_blk_mq_default(void)
+{
+	return use_blk_mq;
+}
+
+bool dm_use_blk_mq(struct mapped_device *md)
+{
+	return md->use_blk_mq;
+}
+EXPORT_SYMBOL_GPL(dm_use_blk_mq);
+
+unsigned dm_get_reserved_rq_based_ios(void)
+{
+	return __dm_get_module_param(&reserved_rq_based_ios,
+				     RESERVED_REQUEST_BASED_IOS, DM_RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
+
+static unsigned dm_get_blk_mq_nr_hw_queues(void)
+{
+	return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32);
+}
+
+static unsigned dm_get_blk_mq_queue_depth(void)
+{
+	return __dm_get_module_param(&dm_mq_queue_depth,
+				     DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
+}
+
+int dm_request_based(struct mapped_device *md)
+{
+	return blk_queue_stackable(md->queue);
+}
+
+static void dm_old_start_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (blk_queue_stopped(q))
+		blk_start_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+void dm_start_queue(struct request_queue *q)
+{
+	if (!q->mq_ops)
+		dm_old_start_queue(q);
+	else {
+		blk_mq_start_stopped_hw_queues(q, true);
+		blk_mq_kick_requeue_list(q);
+	}
+}
+
+static void dm_old_stop_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (blk_queue_stopped(q)) {
+		spin_unlock_irqrestore(q->queue_lock, flags);
+		return;
+	}
+
+	blk_stop_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+void dm_stop_queue(struct request_queue *q)
+{
+	if (!q->mq_ops)
+		dm_old_stop_queue(q);
+	else
+		blk_mq_stop_hw_queues(q);
+}
+
+static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
+						gfp_t gfp_mask)
+{
+	return mempool_alloc(md->io_pool, gfp_mask);
+}
+
+static void free_old_rq_tio(struct dm_rq_target_io *tio)
+{
+	mempool_free(tio, tio->md->io_pool);
+}
+
+static struct request *alloc_old_clone_request(struct mapped_device *md,
+					       gfp_t gfp_mask)
+{
+	return mempool_alloc(md->rq_pool, gfp_mask);
+}
+
+static void free_old_clone_request(struct mapped_device *md, struct request *rq)
+{
+	mempool_free(rq, md->rq_pool);
+}
+
+/*
+ * Partial completion handling for request-based dm
+ */
+static void end_clone_bio(struct bio *clone)
+{
+	struct dm_rq_clone_bio_info *info =
+		container_of(clone, struct dm_rq_clone_bio_info, clone);
+	struct dm_rq_target_io *tio = info->tio;
+	struct bio *bio = info->orig;
+	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
+	int error = clone->bi_error;
+
+	bio_put(clone);
+
+	if (tio->error)
+		/*
+		 * An error has already been detected on the request.
+		 * Once error occurred, just let clone->end_io() handle
+		 * the remainder.
+		 */
+		return;
+	else if (error) {
+		/*
+		 * Don't notify the upper layer of the error yet.
+		 * The error handling decision is made by the target driver,
+		 * when the request is completed.
+		 */
+		tio->error = error;
+		return;
+	}
+
+	/*
+	 * I/O for the bio completed successfully.
+	 * Notify the upper layer of the data completion.
+	 */
+
+	/*
+	 * bios are processed from the head of the list.
+	 * So the completing bio should always be rq->bio.
+	 * If it's not, something is wrong.
+	 */
+	if (tio->orig->bio != bio)
+		DMERR("bio completion is going in the middle of the request");
+
+	/*
+	 * Update the original request.
+	 * Do not use blk_end_request() here, because it may complete
+	 * the original request before the clone, and break the ordering.
+	 */
+	blk_update_request(tio->orig, 0, nr_bytes);
+}
+
+static struct dm_rq_target_io *tio_from_request(struct request *rq)
+{
+	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+}
+
+static void rq_end_stats(struct mapped_device *md, struct request *orig)
+{
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		tio->duration_jiffies = jiffies - tio->duration_jiffies;
+		dm_stats_account_io(&md->stats, rq_data_dir(orig),
+				    blk_rq_pos(orig), tio->n_sectors, true,
+				    tio->duration_jiffies, &tio->stats_aux);
+	}
+}
+
+/*
+ * Don't touch any member of the md after calling this function because
+ * the md may be freed in dm_put() at the end of this function.
+ * Or do dm_get() before calling this function and dm_put() later.
+ */
+static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
+{
+	atomic_dec(&md->pending[rw]);
+
+	/* nudge anyone waiting on suspend queue */
+	if (!md_in_flight(md))
+		wake_up(&md->wait);
+
+	/*
+	 * Run this off this callpath, as drivers could invoke end_io while
+	 * inside their request_fn (and holding the queue lock). Calling
+	 * back into ->request_fn() could deadlock attempting to grab the
+	 * queue lock again.
+	 */
+	if (!md->queue->mq_ops && run_queue)
+		blk_run_queue_async(md->queue);
+
+	/*
+	 * dm_put() must be at the end of this function. See the comment above
+	 */
+	dm_put(md);
+}
+
+static void free_rq_clone(struct request *clone)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
+
+	blk_rq_unprep_clone(clone);
+
+	/*
+	 * It is possible for a clone_old_rq() allocated clone to
+	 * get passed in -- it may not yet have a request_queue.
+	 * This is known to occur if the error target replaces
+	 * a multipath target that has a request_fn queue stacked
+	 * on blk-mq queue(s).
+	 */
+	if (clone->q && clone->q->mq_ops)
+		/* stacked on blk-mq queue(s) */
+		tio->ti->type->release_clone_rq(clone);
+	else if (!md->queue->mq_ops)
+		/* request_fn queue stacked on request_fn queue(s) */
+		free_old_clone_request(md, clone);
+
+	if (!md->queue->mq_ops)
+		free_old_rq_tio(tio);
+}
+
+/*
+ * Complete the clone and the original request.
+ * Must be called without clone's queue lock held,
+ * see end_clone_request() for more details.
+ */
+static void dm_end_request(struct request *clone, int error)
+{
+	int rw = rq_data_dir(clone);
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
+
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+		rq->errors = clone->errors;
+		rq->resid_len = clone->resid_len;
+
+		if (rq->sense)
+			/*
+			 * We are using the sense buffer of the original
+			 * request.
+			 * So setting the length of the sense data is enough.
+			 */
+			rq->sense_len = clone->sense_len;
+	}
+
+	free_rq_clone(clone);
+	rq_end_stats(md, rq);
+	if (!rq->q->mq_ops)
+		blk_end_request_all(rq, error);
+	else
+		blk_mq_end_request(rq, error);
+	rq_completed(md, rw, true);
+}
+
+static void dm_unprep_request(struct request *rq)
+{
+	struct dm_rq_target_io *tio = tio_from_request(rq);
+	struct request *clone = tio->clone;
+
+	if (!rq->q->mq_ops) {
+		rq->special = NULL;
+		rq->cmd_flags &= ~REQ_DONTPREP;
+	}
+
+	if (clone)
+		free_rq_clone(clone);
+	else if (!tio->md->queue->mq_ops)
+		free_old_rq_tio(tio);
+}
+
+/*
+ * Requeue the original request of a clone.
+ */
+static void dm_old_requeue_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	blk_requeue_request(q, rq);
+	blk_run_queue_async(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_mq_requeue_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	unsigned long flags;
+
+	blk_mq_requeue_request(rq);
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!blk_queue_stopped(q))
+		blk_mq_kick_requeue_list(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_requeue_original_request(struct mapped_device *md,
+					struct request *rq)
+{
+	int rw = rq_data_dir(rq);
+
+	rq_end_stats(md, rq);
+	dm_unprep_request(rq);
+
+	if (!rq->q->mq_ops)
+		dm_old_requeue_request(rq);
+	else
+		dm_mq_requeue_request(rq);
+
+	rq_completed(md, rw, false);
+}
+
+static void dm_done(struct request *clone, int error, bool mapped)
+{
+	int r = error;
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	dm_request_endio_fn rq_end_io = NULL;
+
+	if (tio->ti) {
+		rq_end_io = tio->ti->type->rq_end_io;
+
+		if (mapped && rq_end_io)
+			r = rq_end_io(tio->ti, clone, error, &tio->info);
+	}
+
+	if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
+		     !clone->q->limits.max_write_same_sectors))
+		disable_write_same(tio->md);
+
+	if (r <= 0)
+		/* The target wants to complete the I/O */
+		dm_end_request(clone, r);
+	else if (r == DM_ENDIO_INCOMPLETE)
+		/* The target will handle the I/O */
+		return;
+	else if (r == DM_ENDIO_REQUEUE)
+		/* The target wants to requeue the I/O */
+		dm_requeue_original_request(tio->md, tio->orig);
+	else {
+		DMWARN("unimplemented target endio return value: %d", r);
+		BUG();
+	}
+}
+
+/*
+ * Request completion handler for request-based dm
+ */
+static void dm_softirq_done(struct request *rq)
+{
+	bool mapped = true;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
+	struct request *clone = tio->clone;
+	int rw;
+
+	if (!clone) {
+		rq_end_stats(tio->md, rq);
+		rw = rq_data_dir(rq);
+		if (!rq->q->mq_ops) {
+			blk_end_request_all(rq, tio->error);
+			rq_completed(tio->md, rw, false);
+			free_old_rq_tio(tio);
+		} else {
+			blk_mq_end_request(rq, tio->error);
+			rq_completed(tio->md, rw, false);
+		}
+		return;
+	}
+
+	if (rq->cmd_flags & REQ_FAILED)
+		mapped = false;
+
+	dm_done(clone, tio->error, mapped);
+}
+
+/*
+ * Complete the clone and the original request with the error status
+ * through softirq context.
+ */
+static void dm_complete_request(struct request *rq, int error)
+{
+	struct dm_rq_target_io *tio = tio_from_request(rq);
+
+	tio->error = error;
+	if (!rq->q->mq_ops)
+		blk_complete_request(rq);
+	else
+		blk_mq_complete_request(rq, error);
+}
+
+/*
+ * Complete the not-mapped clone and the original request with the error status
+ * through softirq context.
+ * Target's rq_end_io() function isn't called.
+ * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
+ */
+static void dm_kill_unmapped_request(struct request *rq, int error)
+{
+	rq->cmd_flags |= REQ_FAILED;
+	dm_complete_request(rq, error);
+}
+
+/*
+ * Called with the clone's queue lock held (in the case of .request_fn)
+ */
+static void end_clone_request(struct request *clone, int error)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	if (!clone->q->mq_ops) {
+		/*
+		 * This only cleans up the information of the queue to which
+		 * the clone was dispatched.
+		 * The clone is *NOT* actually freed here because it is allocated
+		 * from dm's own mempool (REQ_ALLOCED isn't set).
+		 */
+		__blk_put_request(clone->q, clone);
+	}
+
+	/*
+	 * Actual request completion is done in a softirq context which doesn't
+	 * hold the clone's queue lock.  Otherwise, deadlock could occur because:
+	 *     - another request may be submitted by the upper level driver
+	 *       of the stacking during the completion
+	 *     - the submission which requires queue lock may be done
+	 *       against this clone's queue
+	 */
+	dm_complete_request(tio->orig, error);
+}
+
+/*
+ * Insert an already-mapped clone into its queue.  If insertion fails, the
+ * original request is completed with the insertion error.
+ */
+static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
+{
+	int err;
+
+	if (blk_queue_io_stat(clone->q))
+		clone->cmd_flags |= REQ_IO_STAT;
+	clone->start_time = jiffies;
+
+	err = blk_insert_cloned_request(clone->q, clone);
+	if (!err)
+		return;
+
+	/* must complete clone in terms of original request */
+	dm_complete_request(rq, err);
+}
+
+/*
+ * Per-bio callback passed to blk_rq_prep_clone() by setup_clone().
+ * Links the cloned bio to its original bio and to the tio (passed as
+ * @data), and installs end_clone_bio as its completion handler.
+ * Always returns 0.
+ */
+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+				 void *data)
+{
+	struct dm_rq_clone_bio_info *info;
+
+	/* the clone bio is embedded in a dm_rq_clone_bio_info */
+	info = container_of(bio, struct dm_rq_clone_bio_info, clone);
+	info->tio = data;
+	info->orig = bio_orig;
+
+	bio->bi_end_io = end_clone_bio;
+
+	return 0;
+}
+
+/*
+ * Prepare @clone as a clone of @rq and attach it to @tio.
+ * Returns 0 on success or the error reported by blk_rq_prep_clone(), in
+ * which case neither the clone nor the tio is modified here.
+ */
+static int setup_clone(struct request *clone, struct request *rq,
+		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+	int err = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
+				    dm_rq_bio_constructor, tio);
+
+	if (!err) {
+		/* the clone shares the original's command and sense buffers */
+		clone->cmd = rq->cmd;
+		clone->cmd_len = rq->cmd_len;
+		clone->sense = rq->sense;
+
+		clone->end_io = end_clone_request;
+		clone->end_io_data = tio;
+
+		tio->clone = clone;
+	}
+
+	return err;
+}
+
+/*
+ * Create a clone of @rq for use with a .request_fn request_queue.
+ * Returns the clone (also recorded in @tio by setup_clone()), or NULL if
+ * the clone could not be allocated or set up.
+ */
+static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
+				    struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+	struct request *clone = alloc_old_clone_request(md, gfp_mask);
+
+	if (!clone)
+		return NULL;
+
+	blk_rq_init(NULL, clone);
+	if (!setup_clone(clone, rq, tio, gfp_mask))
+		return clone;
+
+	/* setup failed (-ENOMEM): give the clone back */
+	free_old_clone_request(md, clone);
+	return NULL;
+}
+
+static void map_tio_request(struct kthread_work *work);
+
+/*
+ * Reset @tio for a new original request @rq on device @md.
+ */
+static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
+		     struct mapped_device *md)
+{
+	tio->md = md;
+	tio->orig = rq;
+	tio->clone = NULL;
+	tio->ti = NULL;
+	tio->error = 0;
+
+	/*
+	 * When md->init_tio_pdu is set, info.ptr already carries the
+	 * target-specific data for blk-mq (see: dm_mq_init_request), so it
+	 * must not be wiped.
+	 */
+	if (!md->init_tio_pdu)
+		memset(&tio->info, 0, sizeof(tio->info));
+
+	/* the work item is only needed when a worker thread exists */
+	if (md->kworker_task)
+		init_kthread_work(&tio->work, map_tio_request);
+}
+
+/*
+ * Allocate and initialize the tio for @rq on a .request_fn DM device,
+ * cloning the request up front unless every device in the live table is
+ * a blk-mq device.  Returns NULL if either allocation fails.
+ */
+static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
+					       struct mapped_device *md,
+					       gfp_t gfp_mask)
+{
+	struct dm_rq_target_io *tio = alloc_old_rq_tio(md, gfp_mask);
+	struct dm_table *table;
+	bool clone_failed;
+	int srcu_idx;
+
+	if (!tio)
+		return NULL;
+
+	init_tio(tio, rq, md);
+
+	table = dm_get_live_table(md, &srcu_idx);
+	/*
+	 * Must clone a request if this .request_fn DM device
+	 * is stacked on .request_fn device(s).
+	 */
+	clone_failed = !dm_table_all_blk_mq_devices(table) &&
+		       !clone_old_rq(rq, md, tio, gfp_mask);
+	dm_put_live_table(md, srcu_idx);
+
+	if (clone_failed) {
+		free_old_rq_tio(tio);
+		return NULL;
+	}
+
+	return tio;
+}
+
+/*
+ * prep_rq_fn for .request_fn request-based dm: attaches a tio to the
+ * request through rq->special.
+ * Called with the queue lock held.
+ */
+static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
+{
+	struct dm_rq_target_io *tio;
+
+	if (unlikely(rq->special)) {
+		DMWARN("Already has something in rq->special.");
+		return BLKPREP_KILL;
+	}
+
+	tio = dm_old_prep_tio(rq, q->queuedata, GFP_ATOMIC);
+	if (tio) {
+		rq->special = tio;
+		rq->cmd_flags |= REQ_DONTPREP;
+		return BLKPREP_OK;
+	}
+
+	/* allocation failed: ask the block layer to retry later */
+	return BLKPREP_DEFER;
+}
+
+/*
+ * Map the original request @rq through the target stored in tio->ti.
+ *
+ * If the tio already has a clone, the target's map_rq() is used; otherwise
+ * the target's clone_and_map_rq() supplies the clone, which is then set up
+ * here with setup_clone().
+ *
+ * Returns:
+ * 0                : the request has been processed
+ * DM_MAPIO_REQUEUE : the original request needs to be requeued
+ * < 0              : the request was completed due to failure
+ *
+ * Note that DM_MAPIO_REQUEUE is only returned from the clone_and_map_rq()
+ * branch (the caller must requeue); a DM_MAPIO_REQUEUE coming from map_rq()
+ * is requeued in the switch below and 0 is returned.
+ */
+static int map_request(struct dm_rq_target_io *tio, struct request *rq,
+		       struct mapped_device *md)
+{
+	int r;
+	struct dm_target *ti = tio->ti;
+	struct request *clone = NULL;
+
+	if (tio->clone) {
+		clone = tio->clone;
+		r = ti->type->map_rq(ti, clone, &tio->info);
+	} else {
+		r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+		if (r < 0) {
+			/* The target wants to complete the I/O */
+			dm_kill_unmapped_request(rq, r);
+			return r;
+		}
+		/* anything but REMAPPED is passed straight back to the caller */
+		if (r != DM_MAPIO_REMAPPED)
+			return r;
+		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+			/* -ENOMEM */
+			ti->type->release_clone_rq(clone);
+			return DM_MAPIO_REQUEUE;
+		}
+	}
+
+	switch (r) {
+	case DM_MAPIO_SUBMITTED:
+		/* The target has taken the I/O to submit by itself later */
+		break;
+	case DM_MAPIO_REMAPPED:
+		/* The target has remapped the I/O so dispatch it */
+		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
+				     blk_rq_pos(rq));
+		dm_dispatch_clone_request(clone, rq);
+		break;
+	case DM_MAPIO_REQUEUE:
+		/* The target wants to requeue the I/O */
+		dm_requeue_original_request(md, tio->orig);
+		break;
+	default:
+		/* positive values other than the cases above are unknown */
+		if (r > 0) {
+			DMWARN("unimplemented target map return value: %d", r);
+			BUG();
+		}
+
+		/* The target wants to complete the I/O */
+		dm_kill_unmapped_request(rq, r);
+		return r;
+	}
+
+	return 0;
+}
+
+/*
+ * Mark the original request as started with the block layer and do the
+ * per-device accounting for it: bump the pending counter, record the data
+ * used by the sequential-merge heuristic, account dm-stats if they are in
+ * use, and take a reference on @md for the in-flight I/O.
+ */
+static void dm_start_request(struct mapped_device *md, struct request *orig)
+{
+	if (!orig->q->mq_ops)
+		blk_start_request(orig);
+	else
+		blk_mq_start_request(orig);
+	atomic_inc(&md->pending[rq_data_dir(orig)]);
+
+	/* only tracked while the merge deadline is non-zero */
+	if (md->seq_rq_merge_deadline_usecs) {
+		md->last_rq_pos = rq_end_sector(orig);
+		md->last_rq_rw = rq_data_dir(orig);
+		md->last_rq_start_time = ktime_get();
+	}
+
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		/* start timestamp and size are kept in the tio for later use */
+		tio->duration_jiffies = jiffies;
+		tio->n_sectors = blk_rq_sectors(orig);
+		dm_stats_account_io(&md->stats, rq_data_dir(orig),
+				    blk_rq_pos(orig), tio->n_sectors, false, 0,
+				    &tio->stats_aux);
+	}
+
+	/*
+	 * Hold the md reference here for the in-flight I/O.
+	 * We can't rely on the reference count by device opener,
+	 * because the device may be closed during the request completion
+	 * when all bios are completed.
+	 * See the comment in rq_completed() too.
+	 */
+	dm_get(md);
+}
+
+/*
+ * kthread work function (installed by init_tio()): maps the original
+ * request of the tio that embeds @work and requeues it if map_request()
+ * asks for that.
+ */
+static void map_tio_request(struct kthread_work *work)
+{
+	struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
+	int r;
+
+	r = map_request(tio, rq, md);
+	if (r != DM_MAPIO_REQUEUE)
+		return;
+
+	dm_requeue_original_request(md, rq);
+}
+
+/*
+ * Print the current sequential-merge deadline of @md (in usecs) into @buf
+ * as a decimal number followed by a newline; returns sprintf()'s result.
+ */
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+	unsigned usecs = md->seq_rq_merge_deadline_usecs;
+
+	return sprintf(buf, "%u\n", usecs);
+}
+
+#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
+
+/*
+ * Parse a decimal deadline (usecs) from @buf and store it in @md, clamped
+ * to MAX_SEQ_RQ_MERGE_DEADLINE_USECS.
+ *
+ * Returns @count on success, and also when the device is not of type
+ * DM_TYPE_REQUEST_BASED (the write is then accepted but ignored), or
+ * -EINVAL if @buf does not parse as an unsigned integer.
+ */
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count)
+{
+	unsigned usecs;
+
+	if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
+		if (kstrtouint(buf, 10, &usecs))
+			return -EINVAL;
+
+		md->seq_rq_merge_deadline_usecs =
+			usecs > MAX_SEQ_RQ_MERGE_DEADLINE_USECS ?
+			MAX_SEQ_RQ_MERGE_DEADLINE_USECS : usecs;
+	}
+
+	return count;
+}
+
+/*
+ * Returns true if the merge heuristic is enabled (non-zero deadline) and
+ * the current time is not yet past the start time of the last request plus
+ * that deadline.
+ */
+static bool dm_old_request_peeked_before_merge_deadline(struct mapped_device *md)
+{
+	ktime_t window, deadline;
+
+	if (!md->seq_rq_merge_deadline_usecs)
+		return false;
+
+	/* deadline is configured in usecs; ktime works in nsecs */
+	window = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
+	deadline = ktime_add_safe(md->last_rq_start_time, window);
+
+	return !ktime_after(ktime_get(), deadline);
+}
+
+/*
+ * q->request_fn for old request-based dm.
+ * Called with the queue lock held.
+ *
+ * Peeks requests off @q one at a time, starts each one and hands its tio
+ * to the md's kthread worker (map_tio_request) for mapping.  Stops when the
+ * queue is stopped or empty, or delays the queue when the merge heuristic
+ * or a busy target says to hold back.
+ */
+static void dm_old_request_fn(struct request_queue *q)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_target *ti = md->immutable_target;
+	struct request *rq;
+	struct dm_rq_target_io *tio;
+	sector_t pos = 0;
+
+	/* no immutable target cached: look up the target at sector 0 once */
+	if (unlikely(!ti)) {
+		int srcu_idx;
+		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+
+		/* NOTE(review): map is not checked for NULL here - confirm it cannot be */
+		ti = dm_table_find_target(map, pos);
+		dm_put_live_table(md, srcu_idx);
+	}
+
+	/*
+	 * For suspend, check blk_queue_stopped() and increment
+	 * ->pending within a single queue_lock not to increment the
+	 * number of in-flight I/Os after the queue is stopped in
+	 * dm_suspend().
+	 */
+	while (!blk_queue_stopped(q)) {
+		rq = blk_peek_request(q);
+		if (!rq)
+			return;
+
+		/* always use block 0 to find the target for flushes for now */
+		pos = 0;
+		if (req_op(rq) != REQ_OP_FLUSH)
+			pos = blk_rq_pos(rq);
+
+		/*
+		 * Hold back either when this single-bvec request continues the
+		 * previous one (same direction, starting where it ended) while
+		 * I/O is in flight and the merge deadline has not passed, or
+		 * when the target reports itself busy.
+		 */
+		if ((dm_old_request_peeked_before_merge_deadline(md) &&
+		     md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+		     md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
+		    (ti->type->busy && ti->type->busy(ti))) {
+			/* NOTE(review): delay argument is presumably in msecs - confirm */
+			blk_delay_queue(q, 10);
+			return;
+		}
+
+		dm_start_request(md, rq);
+
+		tio = tio_from_request(rq);
+		/* Establish tio->ti before queuing work (map_tio_request) */
+		tio->ti = ti;
+		queue_kthread_work(&md->kworker, &tio->work);
+		BUG_ON(!irqs_disabled());
+	}
+}
+
+/*
+ * Fully initialize a .request_fn request-based queue.
+ *
+ * Sets up the queue with dm_old_request_fn, installs the softirq-done and
+ * prep callbacks, and starts the per-device worker thread that runs
+ * map_tio_request().
+ *
+ * Returns 0 on success, -EINVAL if the queue could not be initialized, or
+ * the kthread_run() error if the worker thread could not be started.  In
+ * the latter case md->kworker_task is left NULL.
+ */
+int dm_old_init_request_queue(struct mapped_device *md)
+{
+	/* Fully initialize the queue */
+	if (!blk_init_allocated_queue(md->queue, dm_old_request_fn, NULL))
+		return -EINVAL;
+
+	/* disable dm_old_request_fn's merge heuristic by default */
+	md->seq_rq_merge_deadline_usecs = 0;
+
+	dm_init_normal_md_queue(md);
+	blk_queue_softirq_done(md->queue, dm_softirq_done);
+	blk_queue_prep_rq(md->queue, dm_old_prep_fn);
+
+	/* Initialize the request-based DM worker thread */
+	init_kthread_worker(&md->kworker);
+	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+				       "kdmwork-%s", dm_device_name(md));
+	if (IS_ERR(md->kworker_task)) {
+		int error = PTR_ERR(md->kworker_task);
+
+		/*
+		 * Do not leave an ERR_PTR behind: md->kworker_task is tested
+		 * as a plain pointer elsewhere (e.g. init_tio()), so an error
+		 * value would be mistaken for a running worker.
+		 */
+		md->kworker_task = NULL;
+		return error;
+	}
+
+	elv_register_queue(md->queue);
+
+	return 0;
+}
+
+/*
+ * blk-mq .init_request callback: one-time setup of the tio that lives in
+ * the request's pdu.  @data is the mapped_device.  Always returns 0.
+ */
+static int dm_mq_init_request(void *data, struct request *rq,
+		       unsigned int hctx_idx, unsigned int request_idx,
+		       unsigned int numa_node)
+{
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+	struct mapped_device *md = data;
+
+	/*
+	 * dm_mq_queue_rq reads the md back out of the tio, so it has to be
+	 * stored here.
+	 */
+	tio->md = md;
+
+	/* target-specific per-io data is immediately after the tio */
+	if (md->init_tio_pdu)
+		tio->info.ptr = &tio[1];
+
+	return 0;
+}
+
+/*
+ * blk-mq .queue_rq callback for request-based dm.
+ *
+ * Starts the request, (re)initializes its tio and maps it directly via
+ * map_request().  Returns BLK_MQ_RQ_QUEUE_BUSY if the target is busy or
+ * the request has to be requeued, BLK_MQ_RQ_QUEUE_OK otherwise.
+ */
+static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+			  const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+	struct mapped_device *md = tio->md;
+	struct dm_target *ti = md->immutable_target;
+
+	/* no immutable target cached: use the target at sector 0 */
+	if (unlikely(!ti)) {
+		int srcu_idx;
+		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+
+		/* NOTE(review): map is not checked for NULL here - confirm it cannot be */
+		ti = dm_table_find_target(map, 0);
+		dm_put_live_table(md, srcu_idx);
+	}
+
+	/* checked before dm_start_request() so nothing needs undoing */
+	if (ti->type->busy && ti->type->busy(ti))
+		return BLK_MQ_RQ_QUEUE_BUSY;
+
+	dm_start_request(md, rq);
+
+	/* Init tio using md established in .init_request */
+	init_tio(tio, rq, md);
+
+	/*
+	 * Establish tio->ti before calling map_request().
+	 */
+	tio->ti = ti;
+
+	/* Direct call is fine since .queue_rq allows allocations */
+	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+		/* Undo dm_start_request() before requeuing */
+		rq_end_stats(md, rq);
+		rq_completed(md, rq_data_dir(rq), false);
+		return BLK_MQ_RQ_QUEUE_BUSY;
+	}
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+/*
+ * blk-mq operations for request-based dm; installed on the tag set in
+ * dm_mq_init_request_queue().  Completion shares dm_softirq_done with the
+ * .request_fn path.
+ */
+static struct blk_mq_ops dm_mq_ops = {
+	.queue_rq = dm_mq_queue_rq,
+	.map_queue = blk_mq_map_queue,
+	.complete = dm_softirq_done,
+	.init_request = dm_mq_init_request,
+};
+
+/*
+ * Set up @md's queue as a blk-mq request-based dm queue for table @t.
+ *
+ * Allocates and configures md->tag_set (sizing the per-request pdu to hold
+ * the tio plus any per-io data of the table's immutable target), then
+ * initializes md->queue on top of it.
+ *
+ * Returns 0 on success, -EINVAL if not every device in @t is a blk-mq
+ * device, -ENOMEM if the tag set cannot be allocated, or the error from
+ * blk_mq_alloc_tag_set()/blk_mq_init_allocated_queue().  On failure the
+ * tag set is released and md->tag_set is reset to NULL.
+ */
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
+{
+	struct request_queue *q;
+	struct dm_target *immutable_tgt;
+	int err;
+
+	if (!dm_table_all_blk_mq_devices(t)) {
+		DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
+		return -EINVAL;
+	}
+
+	md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
+	if (!md->tag_set)
+		return -ENOMEM;
+
+	md->tag_set->ops = &dm_mq_ops;
+	md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
+	md->tag_set->numa_node = md->numa_node_id;
+	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
+	md->tag_set->driver_data = md;
+
+	md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
+	immutable_tgt = dm_table_get_immutable_target(t);
+	if (immutable_tgt && immutable_tgt->per_io_data_size) {
+		/* any target-specific per-io data is immediately after the tio */
+		md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
+		md->init_tio_pdu = true;
+	}
+
+	err = blk_mq_alloc_tag_set(md->tag_set);
+	if (err)
+		goto out_kfree_tag_set;
+
+	q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
+	if (IS_ERR(q)) {
+		err = PTR_ERR(q);
+		goto out_tag_set;
+	}
+	dm_init_md_queue(md);
+
+	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
+	blk_mq_register_disk(md->disk);
+
+	return 0;
+
+out_tag_set:
+	blk_mq_free_tag_set(md->tag_set);
+out_kfree_tag_set:
+	kfree(md->tag_set);
+	/*
+	 * Clear the pointer: dm_mq_cleanup_mapped_device() frees whatever
+	 * md->tag_set points at, so a stale value would be freed twice.
+	 */
+	md->tag_set = NULL;
+
+	return err;
+}
+
+/*
+ * Release the blk-mq tag set of @md, if one was allocated.
+ * md->tag_set is cleared afterwards so that calling this again is a no-op
+ * rather than a double free.
+ */
+void dm_mq_cleanup_mapped_device(struct mapped_device *md)
+{
+	if (md->tag_set) {
+		blk_mq_free_tag_set(md->tag_set);
+		kfree(md->tag_set);
+		md->tag_set = NULL;
+	}
+}
+
+/* Module parameters for request-based DM: world-readable, writable by owner only. */
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
+module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
+
+module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");

+ 64 - 0
drivers/md/dm-rq.h

@@ -0,0 +1,64 @@
+/*
+ * Internal header file for device mapper
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_RQ_INTERNAL_H
+#define DM_RQ_INTERNAL_H
+
+#include <linux/bio.h>
+#include <linux/kthread.h>
+
+#include "dm-stats.h"
+
+struct mapped_device;
+
+/*
+ * One of these is allocated per request.
+ *
+ * It carries the state request-based dm needs while an original request
+ * is in flight.
+ */
+struct dm_rq_target_io {
+	struct mapped_device *md;	/* device the request was issued to */
+	struct dm_target *ti;		/* target that maps this request */
+	struct request *orig, *clone;	/* original request and its clone (NULL until set up) */
+	struct kthread_work work;	/* work item for deferred mapping */
+	int error;			/* completion status to report */
+	union map_info info;		/* handed to the target's map functions */
+	struct dm_stats_aux stats_aux;	/* auxiliary data for dm-stats accounting */
+	unsigned long duration_jiffies;	/* set to jiffies at request start for dm-stats */
+	unsigned n_sectors;		/* size of the original request in sectors */
+};
+
+/*
+ * For request-based dm - the bio clones we allocate are embedded in these
+ * structs.
+ *
+ * We allocate these with bio_alloc_bioset, using the front_pad parameter when
+ * the bioset is created - this means the bio has to come at the end of the
+ * struct.
+ */
+struct dm_rq_clone_bio_info {
+	struct bio *orig;		/* bio of the original request this clone mirrors */
+	struct dm_rq_target_io *tio;	/* per-request state the clone belongs to */
+	struct bio clone;		/* must stay last, see above */
+};
+
+bool dm_use_blk_mq_default(void);
+bool dm_use_blk_mq(struct mapped_device *md);
+
+int dm_old_init_request_queue(struct mapped_device *md);
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t);
+void dm_mq_cleanup_mapped_device(struct mapped_device *md);
+
+void dm_start_queue(struct request_queue *q);
+void dm_stop_queue(struct request_queue *q);
+
+unsigned dm_get_reserved_rq_based_ios(void);
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count);
+
+#endif

+ 8 - 0
drivers/md/dm-snap.c

@@ -2302,6 +2302,13 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
 	return do_origin(o->dev, bio);
 	return do_origin(o->dev, bio);
 }
 }
 
 
+/*
+ * direct_access method of the origin target: it performs no DAX mapping,
+ * only logs a warning and fails with -EIO.
+ */
+static long origin_direct_access(struct dm_target *ti, sector_t sector,
+		void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	DMWARN("device does not support dax.");
+	return -EIO;
+}
+
 /*
 /*
  * Set the target "max_io_len" field to the minimum of all the snapshots'
  * Set the target "max_io_len" field to the minimum of all the snapshots'
  * chunk sizes.
  * chunk sizes.
@@ -2361,6 +2368,7 @@ static struct target_type origin_target = {
 	.postsuspend = origin_postsuspend,
 	.postsuspend = origin_postsuspend,
 	.status  = origin_status,
 	.status  = origin_status,
 	.iterate_devices = origin_iterate_devices,
 	.iterate_devices = origin_iterate_devices,
+	.direct_access = origin_direct_access,
 };
 };
 
 
 static struct target_type snapshot_target = {
 static struct target_type snapshot_target = {

+ 1 - 1
drivers/md/dm-stats.c

@@ -10,7 +10,7 @@
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/device-mapper.h>
 #include <linux/device-mapper.h>
 
 
-#include "dm.h"
+#include "dm-core.h"
 #include "dm-stats.h"
 #include "dm-stats.h"
 
 
 #define DM_MSG_PREFIX "stats"
 #define DM_MSG_PREFIX "stats"

+ 25 - 1
drivers/md/dm-stripe.c

@@ -308,6 +308,29 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_REMAPPED;
 	return DM_MAPIO_REMAPPED;
 }
 }
 
 
+/*
+ * direct_access method of the striped target: translate @sector to the
+ * stripe device and sector that back it, then forward the request to that
+ * device.  *kaddr and *pfn are filled from the lookup result and the
+ * return value of bdev_direct_access() is passed through.
+ */
+static long stripe_direct_access(struct dm_target *ti, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	struct stripe_c *sc = ti->private;
+	struct blk_dax_ctl dax = { .size = size };
+	uint32_t stripe;
+	long ret;
+
+	/* which stripe, and where inside it */
+	stripe_map_sector(sc, sector, &stripe, &dax.sector);
+	dax.sector += sc->stripe[stripe].physical_start;
+
+	ret = bdev_direct_access(sc->stripe[stripe].dev->bdev, &dax);
+
+	*kaddr = dax.addr;
+	*pfn = dax.pfn;
+	return ret;
+}
+
 /*
 /*
  * Stripe status:
  * Stripe status:
  *
  *
@@ -416,7 +439,7 @@ static void stripe_io_hints(struct dm_target *ti,
 
 
 static struct target_type stripe_target = {
 static struct target_type stripe_target = {
 	.name   = "striped",
 	.name   = "striped",
-	.version = {1, 5, 1},
+	.version = {1, 6, 0},
 	.module = THIS_MODULE,
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
 	.dtr    = stripe_dtr,
@@ -425,6 +448,7 @@ static struct target_type stripe_target = {
 	.status = stripe_status,
 	.status = stripe_status,
 	.iterate_devices = stripe_iterate_devices,
 	.iterate_devices = stripe_iterate_devices,
 	.io_hints = stripe_io_hints,
 	.io_hints = stripe_io_hints,
+	.direct_access = stripe_direct_access,
 };
 };
 
 
 int __init dm_stripe_init(void)
 int __init dm_stripe_init(void)

+ 2 - 1
drivers/md/dm-sysfs.c

@@ -6,7 +6,8 @@
 
 
 #include <linux/sysfs.h>
 #include <linux/sysfs.h>
 #include <linux/dm-ioctl.h>
 #include <linux/dm-ioctl.h>
-#include "dm.h"
+#include "dm-core.h"
+#include "dm-rq.h"
 
 
 struct dm_sysfs_attr {
 struct dm_sysfs_attr {
 	struct attribute attr;
 	struct attribute attr;

+ 90 - 24
drivers/md/dm-table.c

@@ -5,7 +5,7 @@
  * This file is released under the GPL.
  * This file is released under the GPL.
  */
  */
 
 
-#include "dm.h"
+#include "dm-core.h"
 
 
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/vmalloc.h>
@@ -43,8 +43,10 @@ struct dm_table {
 	struct dm_target *targets;
 	struct dm_target *targets;
 
 
 	struct target_type *immutable_target_type;
 	struct target_type *immutable_target_type;
-	unsigned integrity_supported:1;
-	unsigned singleton:1;
+
+	bool integrity_supported:1;
+	bool singleton:1;
+	bool all_blk_mq:1;
 
 
 	/*
 	/*
 	 * Indicates the rw permissions for the new logical
 	 * Indicates the rw permissions for the new logical
@@ -206,6 +208,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
 
 
+	t->type = DM_TYPE_NONE;
 	t->mode = mode;
 	t->mode = mode;
 	t->md = md;
 	t->md = md;
 	*result = t;
 	*result = t;
@@ -703,7 +706,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 			      dm_device_name(t->md), type);
 			      dm_device_name(t->md), type);
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
-		t->singleton = 1;
+		t->singleton = true;
 	}
 	}
 
 
 	if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
 	if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
@@ -824,22 +827,70 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
 }
 }
 EXPORT_SYMBOL(dm_consume_args);
 EXPORT_SYMBOL(dm_consume_args);
 
 
+/* True for both bio-based table types (with or without DAX). */
+static bool __table_type_bio_based(unsigned table_type)
+{
+	switch (table_type) {
+	case DM_TYPE_BIO_BASED:
+	case DM_TYPE_DAX_BIO_BASED:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static bool __table_type_request_based(unsigned table_type)
 static bool __table_type_request_based(unsigned table_type)
 {
 {
 	return (table_type == DM_TYPE_REQUEST_BASED ||
 	return (table_type == DM_TYPE_REQUEST_BASED ||
 		table_type == DM_TYPE_MQ_REQUEST_BASED);
 		table_type == DM_TYPE_MQ_REQUEST_BASED);
 }
 }
 
 
-static int dm_table_set_type(struct dm_table *t)
+/* Record @type as the type of table @t; exported (GPL) for use outside this file. */
+void dm_table_set_type(struct dm_table *t, unsigned type)
+{
+	t->type = type;
+}
+EXPORT_SYMBOL_GPL(dm_table_set_type);
+
+/*
+ * iterate_devices callback: returns 1 if @dev has a request queue with the
+ * DAX flag set, 0 otherwise.  @ti, @start, @len and @data are unused.
+ */
+static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+			       sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	if (!q)
+		return 0;
+
+	return blk_queue_dax(q) ? 1 : 0;
+}
+
+/*
+ * A table supports DAX only if every one of its targets provides
+ * direct_access and iterate_devices, and device_supports_dax() is reported
+ * true through the target's iterate_devices.
+ */
+static bool dm_table_supports_dax(struct dm_table *t)
+{
+	unsigned i;
+
+	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+		struct dm_target *ti = dm_table_get_target(t, i);
+
+		if (!ti->type->direct_access)
+			return false;
+
+		if (!ti->type->iterate_devices)
+			return false;
+
+		if (!ti->type->iterate_devices(ti, device_supports_dax, NULL))
+			return false;
+	}
+
+	return true;
+}
+
+static int dm_table_determine_type(struct dm_table *t)
 {
 {
 	unsigned i;
 	unsigned i;
 	unsigned bio_based = 0, request_based = 0, hybrid = 0;
 	unsigned bio_based = 0, request_based = 0, hybrid = 0;
-	bool use_blk_mq = false;
+	bool verify_blk_mq = false;
 	struct dm_target *tgt;
 	struct dm_target *tgt;
 	struct dm_dev_internal *dd;
 	struct dm_dev_internal *dd;
-	struct list_head *devices;
+	struct list_head *devices = dm_table_get_devices(t);
 	unsigned live_md_type = dm_get_md_type(t->md);
 	unsigned live_md_type = dm_get_md_type(t->md);
 
 
+	if (t->type != DM_TYPE_NONE) {
+		/* target already set the table's type */
+		if (t->type == DM_TYPE_BIO_BASED)
+			return 0;
+		BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
+		goto verify_rq_based;
+	}
+
 	for (i = 0; i < t->num_targets; i++) {
 	for (i = 0; i < t->num_targets; i++) {
 		tgt = t->targets + i;
 		tgt = t->targets + i;
 		if (dm_target_hybrid(tgt))
 		if (dm_target_hybrid(tgt))
@@ -871,11 +922,27 @@ static int dm_table_set_type(struct dm_table *t)
 	if (bio_based) {
 	if (bio_based) {
 		/* We must use this table as bio-based */
 		/* We must use this table as bio-based */
 		t->type = DM_TYPE_BIO_BASED;
 		t->type = DM_TYPE_BIO_BASED;
+		if (dm_table_supports_dax(t) ||
+		    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED))
+			t->type = DM_TYPE_DAX_BIO_BASED;
 		return 0;
 		return 0;
 	}
 	}
 
 
 	BUG_ON(!request_based); /* No targets in this table */
 	BUG_ON(!request_based); /* No targets in this table */
 
 
+	if (list_empty(devices) && __table_type_request_based(live_md_type)) {
+		/* inherit live MD type */
+		t->type = live_md_type;
+		return 0;
+	}
+
+	/*
+	 * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by
+	 * having a compatible target use dm_table_set_type.
+	 */
+	t->type = DM_TYPE_REQUEST_BASED;
+
+verify_rq_based:
 	/*
 	/*
 	 * Request-based dm supports only tables that have a single target now.
 	 * Request-based dm supports only tables that have a single target now.
 	 * To support multiple targets, request splitting support is needed,
 	 * To support multiple targets, request splitting support is needed,
@@ -888,7 +955,6 @@ static int dm_table_set_type(struct dm_table *t)
 	}
 	}
 
 
 	/* Non-request-stackable devices can't be used for request-based dm */
 	/* Non-request-stackable devices can't be used for request-based dm */
-	devices = dm_table_get_devices(t);
 	list_for_each_entry(dd, devices, list) {
 	list_for_each_entry(dd, devices, list) {
 		struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
 		struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
 
 
@@ -899,10 +965,10 @@ static int dm_table_set_type(struct dm_table *t)
 		}
 		}
 
 
 		if (q->mq_ops)
 		if (q->mq_ops)
-			use_blk_mq = true;
+			verify_blk_mq = true;
 	}
 	}
 
 
-	if (use_blk_mq) {
+	if (verify_blk_mq) {
 		/* verify _all_ devices in the table are blk-mq devices */
 		/* verify _all_ devices in the table are blk-mq devices */
 		list_for_each_entry(dd, devices, list)
 		list_for_each_entry(dd, devices, list)
 			if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
 			if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
@@ -910,14 +976,9 @@ static int dm_table_set_type(struct dm_table *t)
 				      " are blk-mq request-stackable");
 				      " are blk-mq request-stackable");
 				return -EINVAL;
 				return -EINVAL;
 			}
 			}
-		t->type = DM_TYPE_MQ_REQUEST_BASED;
 
 
-	} else if (list_empty(devices) && __table_type_request_based(live_md_type)) {
-		/* inherit live MD type */
-		t->type = live_md_type;
-
-	} else
-		t->type = DM_TYPE_REQUEST_BASED;
+		t->all_blk_mq = true;
+	}
 
 
 	return 0;
 	return 0;
 }
 }
@@ -956,14 +1017,19 @@ struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
 	return NULL;
 	return NULL;
 }
 }
 
 
+/* True if table @t has one of the bio-based types (plain or DAX). */
+bool dm_table_bio_based(struct dm_table *t)
+{
+	return __table_type_bio_based(dm_table_get_type(t));
+}
+
 bool dm_table_request_based(struct dm_table *t)
 bool dm_table_request_based(struct dm_table *t)
 {
 {
 	return __table_type_request_based(dm_table_get_type(t));
 	return __table_type_request_based(dm_table_get_type(t));
 }
 }
 
 
-bool dm_table_mq_request_based(struct dm_table *t)
+bool dm_table_all_blk_mq_devices(struct dm_table *t)
 {
 {
-	return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
+	return t->all_blk_mq;
 }
 }
 
 
 static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
 static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
@@ -978,7 +1044,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	if (type == DM_TYPE_BIO_BASED)
+	if (__table_type_bio_based(type))
 		for (i = 0; i < t->num_targets; i++) {
 		for (i = 0; i < t->num_targets; i++) {
 			tgt = t->targets + i;
 			tgt = t->targets + i;
 			per_io_data_size = max(per_io_data_size, tgt->per_io_data_size);
 			per_io_data_size = max(per_io_data_size, tgt->per_io_data_size);
@@ -1106,7 +1172,7 @@ static int dm_table_register_integrity(struct dm_table *t)
 		return 0;
 		return 0;
 
 
 	if (!integrity_profile_exists(dm_disk(md))) {
 	if (!integrity_profile_exists(dm_disk(md))) {
-		t->integrity_supported = 1;
+		t->integrity_supported = true;
 		/*
 		/*
 		 * Register integrity profile during table load; we can do
 		 * Register integrity profile during table load; we can do
 		 * this because the final profile must match during resume.
 		 * this because the final profile must match during resume.
@@ -1129,7 +1195,7 @@ static int dm_table_register_integrity(struct dm_table *t)
 	}
 	}
 
 
 	/* Preserve existing integrity profile */
 	/* Preserve existing integrity profile */
-	t->integrity_supported = 1;
+	t->integrity_supported = true;
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1141,9 +1207,9 @@ int dm_table_complete(struct dm_table *t)
 {
 {
 	int r;
 	int r;
 
 
-	r = dm_table_set_type(t);
+	r = dm_table_determine_type(t);
 	if (r) {
 	if (r) {
-		DMERR("unable to set table type");
+		DMERR("unable to determine table type");
 		return r;
 		return r;
 	}
 	}
 
 

+ 9 - 2
drivers/md/dm-target.c

@@ -4,7 +4,7 @@
  * This file is released under the GPL.
  * This file is released under the GPL.
  */
  */
 
 
-#include "dm.h"
+#include "dm-core.h"
 
 
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/init.h>
@@ -148,9 +148,15 @@ static void io_err_release_clone_rq(struct request *clone)
 {
 {
 }
 }
 
 
+/* direct_access method of the error target: always fails with -EIO. */
+static long io_err_direct_access(struct dm_target *ti, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	return -EIO;
+}
+
 static struct target_type error_target = {
 static struct target_type error_target = {
 	.name = "error",
 	.name = "error",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.features = DM_TARGET_WILDCARD,
 	.features = DM_TARGET_WILDCARD,
 	.ctr  = io_err_ctr,
 	.ctr  = io_err_ctr,
 	.dtr  = io_err_dtr,
 	.dtr  = io_err_dtr,
@@ -158,6 +164,7 @@ static struct target_type error_target = {
 	.map_rq = io_err_map_rq,
 	.map_rq = io_err_map_rq,
 	.clone_and_map_rq = io_err_clone_and_map_rq,
 	.clone_and_map_rq = io_err_clone_and_map_rq,
 	.release_clone_rq = io_err_release_clone_rq,
 	.release_clone_rq = io_err_release_clone_rq,
+	.direct_access = io_err_direct_access,
 };
 };
 
 
 int __init dm_target_init(void)
 int __init dm_target_init(void)

+ 30 - 0
drivers/md/dm-thin-metadata.c

@@ -1677,6 +1677,36 @@ int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *resu
 	return r;
 	return r;
 }
 }
 
 
+/*
+ * Increment the reference count of every data block in [b, e) in the data
+ * space map, under the write side of pmd->root_lock.
+ * Stops at the first failure and returns its error; blocks already
+ * incremented are left as they are.  Returns 0 on success.
+ */
+int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
+{
+	int r = 0;
+
+	down_write(&pmd->root_lock);
+	while (!r && b != e)
+		r = dm_sm_inc_block(pmd->data_sm, b++);
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
+/*
+ * Decrement the reference count of every data block in [b, e) in the data
+ * space map, under the write side of pmd->root_lock.
+ * Stops at the first failure and returns its error; blocks already
+ * decremented are left as they are.  Returns 0 on success.
+ */
+int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
+{
+	int r = 0;
+
+	down_write(&pmd->root_lock);
+	while (!r && b != e)
+		r = dm_sm_dec_block(pmd->data_sm, b++);
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 {
 {
 	int r;
 	int r;

+ 3 - 0
drivers/md/dm-thin-metadata.h

@@ -197,6 +197,9 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
 
 
 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
 
 
+int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
+int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
+
 /*
 /*
  * Returns -ENOSPC if the new size is too small and already allocated
  * Returns -ENOSPC if the new size is too small and already allocated
  * blocks would be lost.
  * blocks would be lost.

+ 91 - 11
drivers/md/dm-thin.c

@@ -253,6 +253,7 @@ struct pool {
 	struct bio_list deferred_flush_bios;
 	struct bio_list deferred_flush_bios;
 	struct list_head prepared_mappings;
 	struct list_head prepared_mappings;
 	struct list_head prepared_discards;
 	struct list_head prepared_discards;
+	struct list_head prepared_discards_pt2;
 	struct list_head active_thins;
 	struct list_head active_thins;
 
 
 	struct dm_deferred_set *shared_read_ds;
 	struct dm_deferred_set *shared_read_ds;
@@ -269,6 +270,7 @@ struct pool {
 
 
 	process_mapping_fn process_prepared_mapping;
 	process_mapping_fn process_prepared_mapping;
 	process_mapping_fn process_prepared_discard;
 	process_mapping_fn process_prepared_discard;
+	process_mapping_fn process_prepared_discard_pt2;
 
 
 	struct dm_bio_prison_cell **cell_sort_array;
 	struct dm_bio_prison_cell **cell_sort_array;
 };
 };
@@ -1001,7 +1003,8 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
 
 
 /*----------------------------------------------------------------*/
 /*----------------------------------------------------------------*/
 
 
-static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
+static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
+						   struct bio *discard_parent)
 {
 {
 	/*
 	/*
 	 * We've already unmapped this range of blocks, but before we
 	 * We've already unmapped this range of blocks, but before we
@@ -1014,7 +1017,7 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m
 	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
 	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
 	struct discard_op op;
 	struct discard_op op;
 
 
-	begin_discard(&op, tc, m->bio);
+	begin_discard(&op, tc, discard_parent);
 	while (b != end) {
 	while (b != end) {
 		/* find start of unmapped run */
 		/* find start of unmapped run */
 		for (; b < end; b++) {
 		for (; b < end; b++) {
@@ -1049,28 +1052,101 @@ out:
 	end_discard(&op, r);
 	end_discard(&op, r);
 }
 }
 
 
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
+{
+	unsigned long flags;
+	struct pool *pool = m->tc->pool;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	list_add_tail(&m->list, &pool->prepared_discards_pt2);
+	spin_unlock_irqrestore(&pool->lock, flags);
+	wake_worker(pool);
+}
+
+static void passdown_endio(struct bio *bio)
+{
+	/*
+	 * It doesn't matter if the passdown discard failed, we still want
+	 * to unmap (we ignore err).
+	 */
+	queue_passdown_pt2(bio->bi_private);
+}
+
+static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
 {
 {
 	int r;
 	int r;
 	struct thin_c *tc = m->tc;
 	struct thin_c *tc = m->tc;
 	struct pool *pool = tc->pool;
 	struct pool *pool = tc->pool;
+	struct bio *discard_parent;
+	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
 
 
+	/*
+	 * Only this thread allocates blocks, so we can be sure that the
+	 * newly unmapped blocks will not be allocated before the end of
+	 * the function.
+	 */
 	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
 	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
 	if (r) {
 	if (r) {
 		metadata_operation_failed(pool, "dm_thin_remove_range", r);
 		metadata_operation_failed(pool, "dm_thin_remove_range", r);
 		bio_io_error(m->bio);
 		bio_io_error(m->bio);
+		cell_defer_no_holder(tc, m->cell);
+		mempool_free(m, pool->mapping_pool);
+		return;
+	}
 
 
-	} else if (m->maybe_shared) {
-		passdown_double_checking_shared_status(m);
+	discard_parent = bio_alloc(GFP_NOIO, 1);
+	if (!discard_parent) {
+		DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
+		       dm_device_name(tc->pool->pool_md));
+		queue_passdown_pt2(m);
 
 
 	} else {
 	} else {
-		struct discard_op op;
-		begin_discard(&op, tc, m->bio);
-		r = issue_discard(&op, m->data_block,
-				  m->data_block + (m->virt_end - m->virt_begin));
-		end_discard(&op, r);
+		discard_parent->bi_end_io = passdown_endio;
+		discard_parent->bi_private = m;
+
+		if (m->maybe_shared)
+			passdown_double_checking_shared_status(m, discard_parent);
+		else {
+			struct discard_op op;
+
+			begin_discard(&op, tc, discard_parent);
+			r = issue_discard(&op, m->data_block, data_end);
+			end_discard(&op, r);
+		}
 	}
 	}
 
 
+	/*
+	 * Increment the unmapped blocks.  This prevents a race between the
+	 * passdown io and reallocation of freed blocks.
+	 */
+	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
+	if (r) {
+		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
+		bio_io_error(m->bio);
+		cell_defer_no_holder(tc, m->cell);
+		mempool_free(m, pool->mapping_pool);
+		return;
+	}
+}
+
+static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
+{
+	int r;
+	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
+
+	/*
+	 * The passdown has completed, so now we can decrement all those
+	 * unmapped blocks.
+	 */
+	r = dm_pool_dec_data_range(pool->pmd, m->data_block,
+				   m->data_block + (m->virt_end - m->virt_begin));
+	if (r) {
+		metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
+		bio_io_error(m->bio);
+	} else
+		bio_endio(m->bio);
+
 	cell_defer_no_holder(tc, m->cell);
 	cell_defer_no_holder(tc, m->cell);
 	mempool_free(m, pool->mapping_pool);
 	mempool_free(m, pool->mapping_pool);
 }
 }
@@ -2215,6 +2291,8 @@ static void do_worker(struct work_struct *ws)
 	throttle_work_update(&pool->throttle);
 	throttle_work_update(&pool->throttle);
 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
 	throttle_work_update(&pool->throttle);
 	throttle_work_update(&pool->throttle);
+	process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
+	throttle_work_update(&pool->throttle);
 	process_deferred_bios(pool);
 	process_deferred_bios(pool);
 	throttle_work_complete(&pool->throttle);
 	throttle_work_complete(&pool->throttle);
 }
 }
@@ -2343,7 +2421,8 @@ static void set_discard_callbacks(struct pool *pool)
 
 
 	if (passdown_enabled(pt)) {
 	if (passdown_enabled(pt)) {
 		pool->process_discard_cell = process_discard_cell_passdown;
 		pool->process_discard_cell = process_discard_cell_passdown;
-		pool->process_prepared_discard = process_prepared_discard_passdown;
+		pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
+		pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
 	} else {
 	} else {
 		pool->process_discard_cell = process_discard_cell_no_passdown;
 		pool->process_discard_cell = process_discard_cell_no_passdown;
 		pool->process_prepared_discard = process_prepared_discard_no_passdown;
 		pool->process_prepared_discard = process_prepared_discard_no_passdown;
@@ -2830,6 +2909,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 	bio_list_init(&pool->deferred_flush_bios);
 	bio_list_init(&pool->deferred_flush_bios);
 	INIT_LIST_HEAD(&pool->prepared_mappings);
 	INIT_LIST_HEAD(&pool->prepared_mappings);
 	INIT_LIST_HEAD(&pool->prepared_discards);
 	INIT_LIST_HEAD(&pool->prepared_discards);
+	INIT_LIST_HEAD(&pool->prepared_discards_pt2);
 	INIT_LIST_HEAD(&pool->active_thins);
 	INIT_LIST_HEAD(&pool->active_thins);
 	pool->low_water_triggered = false;
 	pool->low_water_triggered = false;
 	pool->suspended = true;
 	pool->suspended = true;

+ 1 - 3
drivers/md/dm-verity-fec.c

@@ -453,9 +453,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
 	 */
 	 */
 
 
 	offset = block << v->data_dev_block_bits;
 	offset = block << v->data_dev_block_bits;
-
-	res = offset;
-	div64_u64(res, v->fec->rounds << v->data_dev_block_bits);
+	res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits);
 
 
 	/*
 	/*
 	 * The base RS block we can feed to the interleaver to find out all
 	 * The base RS block we can feed to the interleaver to find out all

+ 244 - 1255
drivers/md/dm.c

@@ -5,13 +5,13 @@
  * This file is released under the GPL.
  * This file is released under the GPL.
  */
  */
 
 
-#include "dm.h"
+#include "dm-core.h"
+#include "dm-rq.h"
 #include "dm-uevent.h"
 #include "dm-uevent.h"
 
 
 #include <linux/init.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/mutex.h>
-#include <linux/moduleparam.h>
 #include <linux/blkpg.h>
 #include <linux/blkpg.h>
 #include <linux/bio.h>
 #include <linux/bio.h>
 #include <linux/mempool.h>
 #include <linux/mempool.h>
@@ -20,14 +20,8 @@
 #include <linux/hdreg.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include <linux/wait.h>
-#include <linux/kthread.h>
-#include <linux/ktime.h>
-#include <linux/elevator.h> /* for rq_end_sector() */
-#include <linux/blk-mq.h>
 #include <linux/pr.h>
 #include <linux/pr.h>
 
 
-#include <trace/events/block.h>
-
 #define DM_MSG_PREFIX "core"
 #define DM_MSG_PREFIX "core"
 
 
 #ifdef CONFIG_PRINTK
 #ifdef CONFIG_PRINTK
@@ -63,7 +57,6 @@ static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
 static struct workqueue_struct *deferred_remove_workqueue;
 static struct workqueue_struct *deferred_remove_workqueue;
 
 
 /*
 /*
- * For bio-based dm.
  * One of these is allocated per bio.
  * One of these is allocated per bio.
  */
  */
 struct dm_io {
 struct dm_io {
@@ -76,36 +69,6 @@ struct dm_io {
 	struct dm_stats_aux stats_aux;
 	struct dm_stats_aux stats_aux;
 };
 };
 
 
-/*
- * For request-based dm.
- * One of these is allocated per request.
- */
-struct dm_rq_target_io {
-	struct mapped_device *md;
-	struct dm_target *ti;
-	struct request *orig, *clone;
-	struct kthread_work work;
-	int error;
-	union map_info info;
-	struct dm_stats_aux stats_aux;
-	unsigned long duration_jiffies;
-	unsigned n_sectors;
-};
-
-/*
- * For request-based dm - the bio clones we allocate are embedded in these
- * structs.
- *
- * We allocate these with bio_alloc_bioset, using the front_pad parameter when
- * the bioset is created - this means the bio has to come at the end of the
- * struct.
- */
-struct dm_rq_clone_bio_info {
-	struct bio *orig;
-	struct dm_rq_target_io *tio;
-	struct bio clone;
-};
-
 #define MINOR_ALLOCED ((void *)-1)
 #define MINOR_ALLOCED ((void *)-1)
 
 
 /*
 /*
@@ -120,130 +83,9 @@ struct dm_rq_clone_bio_info {
 #define DMF_DEFERRED_REMOVE 6
 #define DMF_DEFERRED_REMOVE 6
 #define DMF_SUSPENDED_INTERNALLY 7
 #define DMF_SUSPENDED_INTERNALLY 7
 
 
-/*
- * Work processed by per-device workqueue.
- */
-struct mapped_device {
-	struct srcu_struct io_barrier;
-	struct mutex suspend_lock;
-
-	/*
-	 * The current mapping (struct dm_table *).
-	 * Use dm_get_live_table{_fast} or take suspend_lock for
-	 * dereference.
-	 */
-	void __rcu *map;
-
-	struct list_head table_devices;
-	struct mutex table_devices_lock;
-
-	unsigned long flags;
-
-	struct request_queue *queue;
-	int numa_node_id;
-
-	unsigned type;
-	/* Protect queue and type against concurrent access. */
-	struct mutex type_lock;
-
-	atomic_t holders;
-	atomic_t open_count;
-
-	struct dm_target *immutable_target;
-	struct target_type *immutable_target_type;
-
-	struct gendisk *disk;
-	char name[16];
-
-	void *interface_ptr;
-
-	/*
-	 * A list of ios that arrived while we were suspended.
-	 */
-	atomic_t pending[2];
-	wait_queue_head_t wait;
-	struct work_struct work;
-	spinlock_t deferred_lock;
-	struct bio_list deferred;
-
-	/*
-	 * Event handling.
-	 */
-	wait_queue_head_t eventq;
-	atomic_t event_nr;
-	atomic_t uevent_seq;
-	struct list_head uevent_list;
-	spinlock_t uevent_lock; /* Protect access to uevent_list */
-
-	/* the number of internal suspends */
-	unsigned internal_suspend_count;
-
-	/*
-	 * Processing queue (flush)
-	 */
-	struct workqueue_struct *wq;
-
-	/*
-	 * io objects are allocated from here.
-	 */
-	mempool_t *io_pool;
-	mempool_t *rq_pool;
-
-	struct bio_set *bs;
-
-	/*
-	 * freeze/thaw support require holding onto a super block
-	 */
-	struct super_block *frozen_sb;
-
-	/* forced geometry settings */
-	struct hd_geometry geometry;
-
-	struct block_device *bdev;
-
-	/* kobject and completion */
-	struct dm_kobject_holder kobj_holder;
-
-	/* zero-length flush that will be cloned and submitted to targets */
-	struct bio flush_bio;
-
-	struct dm_stats stats;
-
-	struct kthread_worker kworker;
-	struct task_struct *kworker_task;
-
-	/* for request-based merge heuristic in dm_request_fn() */
-	unsigned seq_rq_merge_deadline_usecs;
-	int last_rq_rw;
-	sector_t last_rq_pos;
-	ktime_t last_rq_start_time;
-
-	/* for blk-mq request-based DM support */
-	struct blk_mq_tag_set *tag_set;
-	bool use_blk_mq:1;
-	bool init_tio_pdu:1;
-};
-
-#ifdef CONFIG_DM_MQ_DEFAULT
-static bool use_blk_mq = true;
-#else
-static bool use_blk_mq = false;
-#endif
-
-#define DM_MQ_NR_HW_QUEUES 1
-#define DM_MQ_QUEUE_DEPTH 2048
 #define DM_NUMA_NODE NUMA_NO_NODE
 #define DM_NUMA_NODE NUMA_NO_NODE
-
-static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
-static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
 static int dm_numa_node = DM_NUMA_NODE;
 static int dm_numa_node = DM_NUMA_NODE;
 
 
-bool dm_use_blk_mq(struct mapped_device *md)
-{
-	return md->use_blk_mq;
-}
-EXPORT_SYMBOL_GPL(dm_use_blk_mq);
-
 /*
 /*
  * For mempools pre-allocation at the table loading time.
  * For mempools pre-allocation at the table loading time.
  */
  */
@@ -259,9 +101,6 @@ struct table_device {
 	struct dm_dev dm_dev;
 	struct dm_dev dm_dev;
 };
 };
 
 
-#define RESERVED_BIO_BASED_IOS		16
-#define RESERVED_REQUEST_BASED_IOS	256
-#define RESERVED_MAX_IOS		1024
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 static struct kmem_cache *_rq_tio_cache;
 static struct kmem_cache *_rq_cache;
 static struct kmem_cache *_rq_cache;
@@ -269,13 +108,9 @@ static struct kmem_cache *_rq_cache;
 /*
 /*
  * Bio-based DM's mempools' reserved IOs set by the user.
  * Bio-based DM's mempools' reserved IOs set by the user.
  */
  */
+#define RESERVED_BIO_BASED_IOS		16
 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 
 
-/*
- * Request-based DM's mempools' reserved IOs set by the user.
- */
-static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
-
 static int __dm_get_module_param_int(int *module_param, int min, int max)
 static int __dm_get_module_param_int(int *module_param, int min, int max)
 {
 {
 	int param = ACCESS_ONCE(*module_param);
 	int param = ACCESS_ONCE(*module_param);
@@ -297,8 +132,8 @@ static int __dm_get_module_param_int(int *module_param, int min, int max)
 	return param;
 	return param;
 }
 }
 
 
-static unsigned __dm_get_module_param(unsigned *module_param,
-				      unsigned def, unsigned max)
+unsigned __dm_get_module_param(unsigned *module_param,
+			       unsigned def, unsigned max)
 {
 {
 	unsigned param = ACCESS_ONCE(*module_param);
 	unsigned param = ACCESS_ONCE(*module_param);
 	unsigned modified_param = 0;
 	unsigned modified_param = 0;
@@ -319,28 +154,10 @@ static unsigned __dm_get_module_param(unsigned *module_param,
 unsigned dm_get_reserved_bio_based_ios(void)
 unsigned dm_get_reserved_bio_based_ios(void)
 {
 {
 	return __dm_get_module_param(&reserved_bio_based_ios,
 	return __dm_get_module_param(&reserved_bio_based_ios,
-				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
+				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 }
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 
 
-unsigned dm_get_reserved_rq_based_ios(void)
-{
-	return __dm_get_module_param(&reserved_rq_based_ios,
-				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
-}
-EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
-
-static unsigned dm_get_blk_mq_nr_hw_queues(void)
-{
-	return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32);
-}
-
-static unsigned dm_get_blk_mq_queue_depth(void)
-{
-	return __dm_get_module_param(&dm_mq_queue_depth,
-				     DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
-}
-
 static unsigned dm_get_numa_node(void)
 static unsigned dm_get_numa_node(void)
 {
 {
 	return __dm_get_module_param_int(&dm_numa_node,
 	return __dm_get_module_param_int(&dm_numa_node,
@@ -679,29 +496,7 @@ static void free_tio(struct dm_target_io *tio)
 	bio_put(&tio->clone);
 	bio_put(&tio->clone);
 }
 }
 
 
-static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
-						gfp_t gfp_mask)
-{
-	return mempool_alloc(md->io_pool, gfp_mask);
-}
-
-static void free_old_rq_tio(struct dm_rq_target_io *tio)
-{
-	mempool_free(tio, tio->md->io_pool);
-}
-
-static struct request *alloc_old_clone_request(struct mapped_device *md,
-					       gfp_t gfp_mask)
-{
-	return mempool_alloc(md->rq_pool, gfp_mask);
-}
-
-static void free_old_clone_request(struct mapped_device *md, struct request *rq)
-{
-	mempool_free(rq, md->rq_pool);
-}
-
-static int md_in_flight(struct mapped_device *md)
+int md_in_flight(struct mapped_device *md)
 {
 {
 	return atomic_read(&md->pending[READ]) +
 	return atomic_read(&md->pending[READ]) +
 	       atomic_read(&md->pending[WRITE]);
 	       atomic_read(&md->pending[WRITE]);
@@ -1019,7 +814,7 @@ static void dec_pending(struct dm_io *io, int error)
 	}
 	}
 }
 }
 
 
-static void disable_write_same(struct mapped_device *md)
+void disable_write_same(struct mapped_device *md)
 {
 {
 	struct queue_limits *limits = dm_get_queue_limits(md);
 	struct queue_limits *limits = dm_get_queue_limits(md);
 
 
@@ -1061,371 +856,6 @@ static void clone_endio(struct bio *bio)
 	dec_pending(io, error);
 	dec_pending(io, error);
 }
 }
 
 
-/*
- * Partial completion handling for request-based dm
- */
-static void end_clone_bio(struct bio *clone)
-{
-	struct dm_rq_clone_bio_info *info =
-		container_of(clone, struct dm_rq_clone_bio_info, clone);
-	struct dm_rq_target_io *tio = info->tio;
-	struct bio *bio = info->orig;
-	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-	int error = clone->bi_error;
-
-	bio_put(clone);
-
-	if (tio->error)
-		/*
-		 * An error has already been detected on the request.
-		 * Once error occurred, just let clone->end_io() handle
-		 * the remainder.
-		 */
-		return;
-	else if (error) {
-		/*
-		 * Don't notice the error to the upper layer yet.
-		 * The error handling decision is made by the target driver,
-		 * when the request is completed.
-		 */
-		tio->error = error;
-		return;
-	}
-
-	/*
-	 * I/O for the bio successfully completed.
-	 * Notice the data completion to the upper layer.
-	 */
-
-	/*
-	 * bios are processed from the head of the list.
-	 * So the completing bio should always be rq->bio.
-	 * If it's not, something wrong is happening.
-	 */
-	if (tio->orig->bio != bio)
-		DMERR("bio completion is going in the middle of the request");
-
-	/*
-	 * Update the original request.
-	 * Do not use blk_end_request() here, because it may complete
-	 * the original request before the clone, and break the ordering.
-	 */
-	blk_update_request(tio->orig, 0, nr_bytes);
-}
-
-static struct dm_rq_target_io *tio_from_request(struct request *rq)
-{
-	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
-}
-
-static void rq_end_stats(struct mapped_device *md, struct request *orig)
-{
-	if (unlikely(dm_stats_used(&md->stats))) {
-		struct dm_rq_target_io *tio = tio_from_request(orig);
-		tio->duration_jiffies = jiffies - tio->duration_jiffies;
-		dm_stats_account_io(&md->stats, rq_data_dir(orig),
-				    blk_rq_pos(orig), tio->n_sectors, true,
-				    tio->duration_jiffies, &tio->stats_aux);
-	}
-}
-
-/*
- * Don't touch any member of the md after calling this function because
- * the md may be freed in dm_put() at the end of this function.
- * Or do dm_get() before calling this function and dm_put() later.
- */
-static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
-{
-	atomic_dec(&md->pending[rw]);
-
-	/* nudge anyone waiting on suspend queue */
-	if (!md_in_flight(md))
-		wake_up(&md->wait);
-
-	/*
-	 * Run this off this callpath, as drivers could invoke end_io while
-	 * inside their request_fn (and holding the queue lock). Calling
-	 * back into ->request_fn() could deadlock attempting to grab the
-	 * queue lock again.
-	 */
-	if (!md->queue->mq_ops && run_queue)
-		blk_run_queue_async(md->queue);
-
-	/*
-	 * dm_put() must be at the end of this function. See the comment above
-	 */
-	dm_put(md);
-}
-
-static void free_rq_clone(struct request *clone)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	struct mapped_device *md = tio->md;
-
-	blk_rq_unprep_clone(clone);
-
-	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
-		/* stacked on blk-mq queue(s) */
-		tio->ti->type->release_clone_rq(clone);
-	else if (!md->queue->mq_ops)
-		/* request_fn queue stacked on request_fn queue(s) */
-		free_old_clone_request(md, clone);
-
-	if (!md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
-/*
- * Complete the clone and the original request.
- * Must be called without clone's queue lock held,
- * see end_clone_request() for more details.
- */
-static void dm_end_request(struct request *clone, int error)
-{
-	int rw = rq_data_dir(clone);
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	struct mapped_device *md = tio->md;
-	struct request *rq = tio->orig;
-
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
-		rq->errors = clone->errors;
-		rq->resid_len = clone->resid_len;
-
-		if (rq->sense)
-			/*
-			 * We are using the sense buffer of the original
-			 * request.
-			 * So setting the length of the sense data is enough.
-			 */
-			rq->sense_len = clone->sense_len;
-	}
-
-	free_rq_clone(clone);
-	rq_end_stats(md, rq);
-	if (!rq->q->mq_ops)
-		blk_end_request_all(rq, error);
-	else
-		blk_mq_end_request(rq, error);
-	rq_completed(md, rw, true);
-}
-
-static void dm_unprep_request(struct request *rq)
-{
-	struct dm_rq_target_io *tio = tio_from_request(rq);
-	struct request *clone = tio->clone;
-
-	if (!rq->q->mq_ops) {
-		rq->special = NULL;
-		rq->cmd_flags &= ~REQ_DONTPREP;
-	}
-
-	if (clone)
-		free_rq_clone(clone);
-	else if (!tio->md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
-/*
- * Requeue the original request of a clone.
- */
-static void dm_old_requeue_request(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_requeue_request(q, rq);
-	blk_run_queue_async(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_mq_requeue_request(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-	unsigned long flags;
-
-	blk_mq_requeue_request(rq);
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (!blk_queue_stopped(q))
-		blk_mq_kick_requeue_list(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_requeue_original_request(struct mapped_device *md,
-					struct request *rq)
-{
-	int rw = rq_data_dir(rq);
-
-	rq_end_stats(md, rq);
-	dm_unprep_request(rq);
-
-	if (!rq->q->mq_ops)
-		dm_old_requeue_request(rq);
-	else
-		dm_mq_requeue_request(rq);
-
-	rq_completed(md, rw, false);
-}
-
-static void dm_old_stop_queue(struct request_queue *q)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (blk_queue_stopped(q)) {
-		spin_unlock_irqrestore(q->queue_lock, flags);
-		return;
-	}
-
-	blk_stop_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_stop_queue(struct request_queue *q)
-{
-	if (!q->mq_ops)
-		dm_old_stop_queue(q);
-	else
-		blk_mq_stop_hw_queues(q);
-}
-
-static void dm_old_start_queue(struct request_queue *q)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (blk_queue_stopped(q))
-		blk_start_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_start_queue(struct request_queue *q)
-{
-	if (!q->mq_ops)
-		dm_old_start_queue(q);
-	else {
-		blk_mq_start_stopped_hw_queues(q, true);
-		blk_mq_kick_requeue_list(q);
-	}
-}
-
-static void dm_done(struct request *clone, int error, bool mapped)
-{
-	int r = error;
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	dm_request_endio_fn rq_end_io = NULL;
-
-	if (tio->ti) {
-		rq_end_io = tio->ti->type->rq_end_io;
-
-		if (mapped && rq_end_io)
-			r = rq_end_io(tio->ti, clone, error, &tio->info);
-	}
-
-	if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
-		     !clone->q->limits.max_write_same_sectors))
-		disable_write_same(tio->md);
-
-	if (r <= 0)
-		/* The target wants to complete the I/O */
-		dm_end_request(clone, r);
-	else if (r == DM_ENDIO_INCOMPLETE)
-		/* The target will handle the I/O */
-		return;
-	else if (r == DM_ENDIO_REQUEUE)
-		/* The target wants to requeue the I/O */
-		dm_requeue_original_request(tio->md, tio->orig);
-	else {
-		DMWARN("unimplemented target endio return value: %d", r);
-		BUG();
-	}
-}
-
-/*
- * Request completion handler for request-based dm
- */
-static void dm_softirq_done(struct request *rq)
-{
-	bool mapped = true;
-	struct dm_rq_target_io *tio = tio_from_request(rq);
-	struct request *clone = tio->clone;
-	int rw;
-
-	if (!clone) {
-		rq_end_stats(tio->md, rq);
-		rw = rq_data_dir(rq);
-		if (!rq->q->mq_ops) {
-			blk_end_request_all(rq, tio->error);
-			rq_completed(tio->md, rw, false);
-			free_old_rq_tio(tio);
-		} else {
-			blk_mq_end_request(rq, tio->error);
-			rq_completed(tio->md, rw, false);
-		}
-		return;
-	}
-
-	if (rq->cmd_flags & REQ_FAILED)
-		mapped = false;
-
-	dm_done(clone, tio->error, mapped);
-}
-
-/*
- * Complete the clone and the original request with the error status
- * through softirq context.
- */
-static void dm_complete_request(struct request *rq, int error)
-{
-	struct dm_rq_target_io *tio = tio_from_request(rq);
-
-	tio->error = error;
-	if (!rq->q->mq_ops)
-		blk_complete_request(rq);
-	else
-		blk_mq_complete_request(rq, error);
-}
-
-/*
- * Complete the not-mapped clone and the original request with the error status
- * through softirq context.
- * Target's rq_end_io() function isn't called.
- * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
- */
-static void dm_kill_unmapped_request(struct request *rq, int error)
-{
-	rq->cmd_flags |= REQ_FAILED;
-	dm_complete_request(rq, error);
-}
-
-/*
- * Called with the clone's queue lock held (in the case of .request_fn)
- */
-static void end_clone_request(struct request *clone, int error)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-
-	if (!clone->q->mq_ops) {
-		/*
-		 * For just cleaning up the information of the queue in which
-		 * the clone was dispatched.
-		 * The clone is *NOT* freed actually here because it is alloced
-		 * from dm own mempool (REQ_ALLOCED isn't set).
-		 */
-		__blk_put_request(clone->q, clone);
-	}
-
-	/*
-	 * Actual request completion is done in a softirq context which doesn't
-	 * hold the clone's queue lock.  Otherwise, deadlock could occur because:
-	 *     - another request may be submitted by the upper level driver
-	 *       of the stacking during the completion
-	 *     - the submission which requires queue lock may be done
-	 *       against this clone's queue
-	 */
-	dm_complete_request(tio->orig, error);
-}
-
 /*
 /*
  * Return maximum size of I/O possible at the supplied sector up to the current
  * Return maximum size of I/O possible at the supplied sector up to the current
  * target boundary.
  * target boundary.
@@ -1475,14 +905,41 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 }
 }
 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 
 
-/*
- * A target may call dm_accept_partial_bio only from the map routine.  It is
- * allowed for all bio types except REQ_PREFLUSH.
- *
- * dm_accept_partial_bio informs the dm that the target only wants to process
- * additional n_sectors sectors of the bio and the rest of the data should be
- * sent in a next bio.
- *
+static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	struct mapped_device *md = bdev->bd_disk->private_data;
+	struct dm_table *map;
+	struct dm_target *ti;
+	int srcu_idx;
+	long len, ret = -EIO;
+
+	map = dm_get_live_table(md, &srcu_idx);
+	if (!map)
+		goto out;
+
+	ti = dm_table_find_target(map, sector);
+	if (!dm_target_is_valid(ti))
+		goto out;
+
+	len = max_io_len(sector, ti) << SECTOR_SHIFT;
+	size = min(len, size);
+
+	if (ti->type->direct_access)
+		ret = ti->type->direct_access(ti, sector, kaddr, pfn, size);
+out:
+	dm_put_live_table(md, srcu_idx);
+	return min(ret, size);
+}
+
+/*
+ * A target may call dm_accept_partial_bio only from the map routine.  It is
+ * allowed for all bio types except REQ_PREFLUSH.
+ *
+ * dm_accept_partial_bio informs the dm that the target only wants to process
+ * additional n_sectors sectors of the bio and the rest of the data should be
+ * sent in a next bio.
+ *
  * A diagram that explains the arithmetics:
  * A diagram that explains the arithmetics:
  * +--------------------+---------------+-------+
  * +--------------------+---------------+-------+
  * |         1          |       2       |   3   |
  * |         1          |       2       |   3   |
@@ -1684,512 +1141,165 @@ static unsigned get_num_write_same_bios(struct dm_target *ti)
 	return ti->num_write_same_bios;
 	return ti->num_write_same_bios;
 }
 }
 
 
-typedef bool (*is_split_required_fn)(struct dm_target *ti);
-
-static bool is_split_required_for_discard(struct dm_target *ti)
-{
-	return ti->split_discard_bios;
-}
-
-static int __send_changing_extent_only(struct clone_info *ci,
-				       get_num_bios_fn get_num_bios,
-				       is_split_required_fn is_split_required)
-{
-	struct dm_target *ti;
-	unsigned len;
-	unsigned num_bios;
-
-	do {
-		ti = dm_table_find_target(ci->map, ci->sector);
-		if (!dm_target_is_valid(ti))
-			return -EIO;
-
-		/*
-		 * Even though the device advertised support for this type of
-		 * request, that does not mean every target supports it, and
-		 * reconfiguration might also have changed that since the
-		 * check was performed.
-		 */
-		num_bios = get_num_bios ? get_num_bios(ti) : 0;
-		if (!num_bios)
-			return -EOPNOTSUPP;
-
-		if (is_split_required && !is_split_required(ti))
-			len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
-		else
-			len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
-
-		__send_duplicate_bios(ci, ti, num_bios, &len);
-
-		ci->sector += len;
-	} while (ci->sector_count -= len);
-
-	return 0;
-}
-
-static int __send_discard(struct clone_info *ci)
-{
-	return __send_changing_extent_only(ci, get_num_discard_bios,
-					   is_split_required_for_discard);
-}
-
-static int __send_write_same(struct clone_info *ci)
-{
-	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
-}
-
-/*
- * Select the correct strategy for processing a non-flush bio.
- */
-static int __split_and_process_non_flush(struct clone_info *ci)
-{
-	struct bio *bio = ci->bio;
-	struct dm_target *ti;
-	unsigned len;
-	int r;
-
-	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
-		return __send_discard(ci);
-	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
-		return __send_write_same(ci);
-
-	ti = dm_table_find_target(ci->map, ci->sector);
-	if (!dm_target_is_valid(ti))
-		return -EIO;
-
-	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
-
-	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
-	if (r < 0)
-		return r;
-
-	ci->sector += len;
-	ci->sector_count -= len;
-
-	return 0;
-}
-
-/*
- * Entry point to split a bio into clones and submit them to the targets.
- */
-static void __split_and_process_bio(struct mapped_device *md,
-				    struct dm_table *map, struct bio *bio)
-{
-	struct clone_info ci;
-	int error = 0;
-
-	if (unlikely(!map)) {
-		bio_io_error(bio);
-		return;
-	}
-
-	ci.map = map;
-	ci.md = md;
-	ci.io = alloc_io(md);
-	ci.io->error = 0;
-	atomic_set(&ci.io->io_count, 1);
-	ci.io->bio = bio;
-	ci.io->md = md;
-	spin_lock_init(&ci.io->endio_lock);
-	ci.sector = bio->bi_iter.bi_sector;
-
-	start_io_acct(ci.io);
-
-	if (bio->bi_rw & REQ_PREFLUSH) {
-		ci.bio = &ci.md->flush_bio;
-		ci.sector_count = 0;
-		error = __send_empty_flush(&ci);
-		/* dec_pending submits any data associated with flush */
-	} else {
-		ci.bio = bio;
-		ci.sector_count = bio_sectors(bio);
-		while (ci.sector_count && !error)
-			error = __split_and_process_non_flush(&ci);
-	}
-
-	/* drop the extra reference count */
-	dec_pending(ci.io, error);
-}
-/*-----------------------------------------------------------------
- * CRUD END
- *---------------------------------------------------------------*/
-
-/*
- * The request function that just remaps the bio built up by
- * dm_merge_bvec.
- */
-static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
-{
-	int rw = bio_data_dir(bio);
-	struct mapped_device *md = q->queuedata;
-	int srcu_idx;
-	struct dm_table *map;
-
-	map = dm_get_live_table(md, &srcu_idx);
-
-	generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
-
-	/* if we're suspended, we have to queue this io for later */
-	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
-		dm_put_live_table(md, srcu_idx);
-
-		if (!(bio->bi_rw & REQ_RAHEAD))
-			queue_io(md, bio);
-		else
-			bio_io_error(bio);
-		return BLK_QC_T_NONE;
-	}
-
-	__split_and_process_bio(md, map, bio);
-	dm_put_live_table(md, srcu_idx);
-	return BLK_QC_T_NONE;
-}
-
-int dm_request_based(struct mapped_device *md)
-{
-	return blk_queue_stackable(md->queue);
-}
-
-static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
-{
-	int r;
-
-	if (blk_queue_io_stat(clone->q))
-		clone->cmd_flags |= REQ_IO_STAT;
-
-	clone->start_time = jiffies;
-	r = blk_insert_cloned_request(clone->q, clone);
-	if (r)
-		/* must complete clone in terms of original request */
-		dm_complete_request(rq, r);
-}
-
-static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
-				 void *data)
-{
-	struct dm_rq_target_io *tio = data;
-	struct dm_rq_clone_bio_info *info =
-		container_of(bio, struct dm_rq_clone_bio_info, clone);
-
-	info->orig = bio_orig;
-	info->tio = tio;
-	bio->bi_end_io = end_clone_bio;
-
-	return 0;
-}
-
-static int setup_clone(struct request *clone, struct request *rq,
-		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	int r;
-
-	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
-			      dm_rq_bio_constructor, tio);
-	if (r)
-		return r;
-
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
-	clone->end_io = end_clone_request;
-	clone->end_io_data = tio;
-
-	tio->clone = clone;
-
-	return 0;
-}
-
-static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
-				    struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	/*
-	 * Create clone for use with .request_fn request_queue
-	 */
-	struct request *clone;
-
-	clone = alloc_old_clone_request(md, gfp_mask);
-	if (!clone)
-		return NULL;
-
-	blk_rq_init(NULL, clone);
-	if (setup_clone(clone, rq, tio, gfp_mask)) {
-		/* -ENOMEM */
-		free_old_clone_request(md, clone);
-		return NULL;
-	}
-
-	return clone;
-}
-
-static void map_tio_request(struct kthread_work *work);
-
-static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
-		     struct mapped_device *md)
-{
-	tio->md = md;
-	tio->ti = NULL;
-	tio->clone = NULL;
-	tio->orig = rq;
-	tio->error = 0;
-	/*
-	 * Avoid initializing info for blk-mq; it passes
-	 * target-specific data through info.ptr
-	 * (see: dm_mq_init_request)
-	 */
-	if (!md->init_tio_pdu)
-		memset(&tio->info, 0, sizeof(tio->info));
-	if (md->kworker_task)
-		init_kthread_work(&tio->work, map_tio_request);
-}
-
-static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
-					       struct mapped_device *md,
-					       gfp_t gfp_mask)
-{
-	struct dm_rq_target_io *tio;
-	int srcu_idx;
-	struct dm_table *table;
-
-	tio = alloc_old_rq_tio(md, gfp_mask);
-	if (!tio)
-		return NULL;
-
-	init_tio(tio, rq, md);
-
-	table = dm_get_live_table(md, &srcu_idx);
-	/*
-	 * Must clone a request if this .request_fn DM device
-	 * is stacked on .request_fn device(s).
-	 */
-	if (!dm_table_mq_request_based(table)) {
-		if (!clone_old_rq(rq, md, tio, gfp_mask)) {
-			dm_put_live_table(md, srcu_idx);
-			free_old_rq_tio(tio);
-			return NULL;
-		}
-	}
-	dm_put_live_table(md, srcu_idx);
-
-	return tio;
-}
-
-/*
- * Called with the queue lock held.
- */
-static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_rq_target_io *tio;
-
-	if (unlikely(rq->special)) {
-		DMWARN("Already has something in rq->special.");
-		return BLKPREP_KILL;
-	}
-
-	tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
-	if (!tio)
-		return BLKPREP_DEFER;
-
-	rq->special = tio;
-	rq->cmd_flags |= REQ_DONTPREP;
-
-	return BLKPREP_OK;
-}
-
-/*
- * Returns:
- * 0                : the request has been processed
- * DM_MAPIO_REQUEUE : the original request needs to be requeued
- * < 0              : the request was completed due to failure
- */
-static int map_request(struct dm_rq_target_io *tio, struct request *rq,
-		       struct mapped_device *md)
-{
-	int r;
-	struct dm_target *ti = tio->ti;
-	struct request *clone = NULL;
-
-	if (tio->clone) {
-		clone = tio->clone;
-		r = ti->type->map_rq(ti, clone, &tio->info);
-	} else {
-		r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
-		if (r < 0) {
-			/* The target wants to complete the I/O */
-			dm_kill_unmapped_request(rq, r);
-			return r;
-		}
-		if (r != DM_MAPIO_REMAPPED)
-			return r;
-		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
-			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
-			return DM_MAPIO_REQUEUE;
-		}
-	}
-
-	switch (r) {
-	case DM_MAPIO_SUBMITTED:
-		/* The target has taken the I/O to submit by itself later */
-		break;
-	case DM_MAPIO_REMAPPED:
-		/* The target has remapped the I/O so dispatch it */
-		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
-				     blk_rq_pos(rq));
-		dm_dispatch_clone_request(clone, rq);
-		break;
-	case DM_MAPIO_REQUEUE:
-		/* The target wants to requeue the I/O */
-		dm_requeue_original_request(md, tio->orig);
-		break;
-	default:
-		if (r > 0) {
-			DMWARN("unimplemented target map return value: %d", r);
-			BUG();
-		}
-
-		/* The target wants to complete the I/O */
-		dm_kill_unmapped_request(rq, r);
-		return r;
-	}
+typedef bool (*is_split_required_fn)(struct dm_target *ti);
 
 
-	return 0;
+static bool is_split_required_for_discard(struct dm_target *ti)
+{
+	return ti->split_discard_bios;
 }
 }
 
 
-static void map_tio_request(struct kthread_work *work)
+static int __send_changing_extent_only(struct clone_info *ci,
+				       get_num_bios_fn get_num_bios,
+				       is_split_required_fn is_split_required)
 {
 {
-	struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
-	struct request *rq = tio->orig;
-	struct mapped_device *md = tio->md;
+	struct dm_target *ti;
+	unsigned len;
+	unsigned num_bios;
 
 
-	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-		dm_requeue_original_request(md, rq);
-}
+	do {
+		ti = dm_table_find_target(ci->map, ci->sector);
+		if (!dm_target_is_valid(ti))
+			return -EIO;
 
 
-static void dm_start_request(struct mapped_device *md, struct request *orig)
-{
-	if (!orig->q->mq_ops)
-		blk_start_request(orig);
-	else
-		blk_mq_start_request(orig);
-	atomic_inc(&md->pending[rq_data_dir(orig)]);
+		/*
+		 * Even though the device advertised support for this type of
+		 * request, that does not mean every target supports it, and
+		 * reconfiguration might also have changed that since the
+		 * check was performed.
+		 */
+		num_bios = get_num_bios ? get_num_bios(ti) : 0;
+		if (!num_bios)
+			return -EOPNOTSUPP;
 
 
-	if (md->seq_rq_merge_deadline_usecs) {
-		md->last_rq_pos = rq_end_sector(orig);
-		md->last_rq_rw = rq_data_dir(orig);
-		md->last_rq_start_time = ktime_get();
-	}
+		if (is_split_required && !is_split_required(ti))
+			len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+		else
+			len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
 
 
-	if (unlikely(dm_stats_used(&md->stats))) {
-		struct dm_rq_target_io *tio = tio_from_request(orig);
-		tio->duration_jiffies = jiffies;
-		tio->n_sectors = blk_rq_sectors(orig);
-		dm_stats_account_io(&md->stats, rq_data_dir(orig),
-				    blk_rq_pos(orig), tio->n_sectors, false, 0,
-				    &tio->stats_aux);
-	}
+		__send_duplicate_bios(ci, ti, num_bios, &len);
 
 
-	/*
-	 * Hold the md reference here for the in-flight I/O.
-	 * We can't rely on the reference count by device opener,
-	 * because the device may be closed during the request completion
-	 * when all bios are completed.
-	 * See the comment in rq_completed() too.
-	 */
-	dm_get(md);
+		ci->sector += len;
+	} while (ci->sector_count -= len);
+
+	return 0;
 }
 }
 
 
-#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
+static int __send_discard(struct clone_info *ci)
+{
+	return __send_changing_extent_only(ci, get_num_discard_bios,
+					   is_split_required_for_discard);
+}
 
 
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+static int __send_write_same(struct clone_info *ci)
 {
 {
-	return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
+	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
 }
 }
 
 
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
-						     const char *buf, size_t count)
+/*
+ * Select the correct strategy for processing a non-flush bio.
+ */
+static int __split_and_process_non_flush(struct clone_info *ci)
 {
 {
-	unsigned deadline;
+	struct bio *bio = ci->bio;
+	struct dm_target *ti;
+	unsigned len;
+	int r;
 
 
-	if (!dm_request_based(md) || md->use_blk_mq)
-		return count;
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+		return __send_discard(ci);
+	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
+		return __send_write_same(ci);
 
 
-	if (kstrtouint(buf, 10, &deadline))
-		return -EINVAL;
+	ti = dm_table_find_target(ci->map, ci->sector);
+	if (!dm_target_is_valid(ti))
+		return -EIO;
+
+	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
 
 
-	if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
-		deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
+	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
+	if (r < 0)
+		return r;
 
 
-	md->seq_rq_merge_deadline_usecs = deadline;
+	ci->sector += len;
+	ci->sector_count -= len;
 
 
-	return count;
+	return 0;
 }
 }
 
 
-static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
+/*
+ * Entry point to split a bio into clones and submit them to the targets.
+ */
+static void __split_and_process_bio(struct mapped_device *md,
+				    struct dm_table *map, struct bio *bio)
 {
 {
-	ktime_t kt_deadline;
+	struct clone_info ci;
+	int error = 0;
+
+	if (unlikely(!map)) {
+		bio_io_error(bio);
+		return;
+	}
+
+	ci.map = map;
+	ci.md = md;
+	ci.io = alloc_io(md);
+	ci.io->error = 0;
+	atomic_set(&ci.io->io_count, 1);
+	ci.io->bio = bio;
+	ci.io->md = md;
+	spin_lock_init(&ci.io->endio_lock);
+	ci.sector = bio->bi_iter.bi_sector;
 
 
-	if (!md->seq_rq_merge_deadline_usecs)
-		return false;
+	start_io_acct(ci.io);
 
 
-	kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
-	kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
+	if (bio->bi_rw & REQ_PREFLUSH) {
+		ci.bio = &ci.md->flush_bio;
+		ci.sector_count = 0;
+		error = __send_empty_flush(&ci);
+		/* dec_pending submits any data associated with flush */
+	} else {
+		ci.bio = bio;
+		ci.sector_count = bio_sectors(bio);
+		while (ci.sector_count && !error)
+			error = __split_and_process_non_flush(&ci);
+	}
 
 
-	return !ktime_after(ktime_get(), kt_deadline);
+	/* drop the extra reference count */
+	dec_pending(ci.io, error);
 }
 }
+/*-----------------------------------------------------------------
+ * CRUD END
+ *---------------------------------------------------------------*/
 
 
 /*
 /*
- * q->request_fn for request-based dm.
- * Called with the queue lock held.
+ * The request function that just remaps the bio built up by
+ * dm_merge_bvec.
  */
  */
-static void dm_request_fn(struct request_queue *q)
+static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
 {
 {
+	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
 	struct mapped_device *md = q->queuedata;
-	struct dm_target *ti = md->immutable_target;
-	struct request *rq;
-	struct dm_rq_target_io *tio;
-	sector_t pos = 0;
-
-	if (unlikely(!ti)) {
-		int srcu_idx;
-		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-
-		ti = dm_table_find_target(map, pos);
-		dm_put_live_table(md, srcu_idx);
-	}
-
-	/*
-	 * For suspend, check blk_queue_stopped() and increment
-	 * ->pending within a single queue_lock not to increment the
-	 * number of in-flight I/Os after the queue is stopped in
-	 * dm_suspend().
-	 */
-	while (!blk_queue_stopped(q)) {
-		rq = blk_peek_request(q);
-		if (!rq)
-			return;
+	int srcu_idx;
+	struct dm_table *map;
 
 
-		/* always use block 0 to find the target for flushes for now */
-		pos = 0;
-		if (req_op(rq) != REQ_OP_FLUSH)
-			pos = blk_rq_pos(rq);
+	map = dm_get_live_table(md, &srcu_idx);
 
 
-		if ((dm_request_peeked_before_merge_deadline(md) &&
-		     md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
-		     md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
-		    (ti->type->busy && ti->type->busy(ti))) {
-			blk_delay_queue(q, HZ / 100);
-			return;
-		}
+	generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
 
 
-		dm_start_request(md, rq);
+	/* if we're suspended, we have to queue this io for later */
+	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
+		dm_put_live_table(md, srcu_idx);
 
 
-		tio = tio_from_request(rq);
-		/* Establish tio->ti before queuing work (map_tio_request) */
-		tio->ti = ti;
-		queue_kthread_work(&md->kworker, &tio->work);
-		BUG_ON(!irqs_disabled());
+		if (!(bio->bi_rw & REQ_RAHEAD))
+			queue_io(md, bio);
+		else
+			bio_io_error(bio);
+		return BLK_QC_T_NONE;
 	}
 	}
+
+	__split_and_process_bio(md, map, bio);
+	dm_put_live_table(md, srcu_idx);
+	return BLK_QC_T_NONE;
 }
 }
 
 
 static int dm_any_congested(void *congested_data, int bdi_bits)
 static int dm_any_congested(void *congested_data, int bdi_bits)
@@ -2269,7 +1379,7 @@ static const struct block_device_operations dm_blk_dops;
 
 
 static void dm_wq_work(struct work_struct *work);
 static void dm_wq_work(struct work_struct *work);
 
 
-static void dm_init_md_queue(struct mapped_device *md)
+void dm_init_md_queue(struct mapped_device *md)
 {
 {
 	/*
 	/*
 	 * Request-based dm devices cannot be stacked on top of bio-based dm
 	 * Request-based dm devices cannot be stacked on top of bio-based dm
@@ -2290,7 +1400,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 	md->queue->backing_dev_info.congested_data = md;
 	md->queue->backing_dev_info.congested_data = md;
 }
 }
 
 
-static void dm_init_normal_md_queue(struct mapped_device *md)
+void dm_init_normal_md_queue(struct mapped_device *md)
 {
 {
 	md->use_blk_mq = false;
 	md->use_blk_mq = false;
 	dm_init_md_queue(md);
 	dm_init_md_queue(md);
@@ -2330,6 +1440,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
 		bdput(md->bdev);
 		bdput(md->bdev);
 		md->bdev = NULL;
 		md->bdev = NULL;
 	}
 	}
+
+	dm_mq_cleanup_mapped_device(md);
 }
 }
 
 
 /*
 /*
@@ -2363,7 +1475,7 @@ static struct mapped_device *alloc_dev(int minor)
 		goto bad_io_barrier;
 		goto bad_io_barrier;
 
 
 	md->numa_node_id = numa_node_id;
 	md->numa_node_id = numa_node_id;
-	md->use_blk_mq = use_blk_mq;
+	md->use_blk_mq = dm_use_blk_mq_default();
 	md->init_tio_pdu = false;
 	md->init_tio_pdu = false;
 	md->type = DM_TYPE_NONE;
 	md->type = DM_TYPE_NONE;
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->suspend_lock);
@@ -2448,10 +1560,6 @@ static void free_dev(struct mapped_device *md)
 	unlock_fs(md);
 	unlock_fs(md);
 
 
 	cleanup_mapped_device(md);
 	cleanup_mapped_device(md);
-	if (md->tag_set) {
-		blk_mq_free_tag_set(md->tag_set);
-		kfree(md->tag_set);
-	}
 
 
 	free_table_devices(&md->table_devices);
 	free_table_devices(&md->table_devices);
 	dm_stats_cleanup(&md->stats);
 	dm_stats_cleanup(&md->stats);
@@ -2467,7 +1575,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 
 
 	if (md->bs) {
 	if (md->bs) {
 		/* The md already has necessary mempools. */
 		/* The md already has necessary mempools. */
-		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+		if (dm_table_bio_based(t)) {
 			/*
 			/*
 			 * Reload bioset because front_pad may have changed
 			 * Reload bioset because front_pad may have changed
 			 * because a different table was loaded.
 			 * because a different table was loaded.
@@ -2657,176 +1765,15 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
 }
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
 
-static void dm_old_init_rq_based_worker_thread(struct mapped_device *md)
-{
-	/* Initialize the request-based DM worker thread */
-	init_kthread_worker(&md->kworker);
-	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
-				       "kdmwork-%s", dm_device_name(md));
-}
-
-/*
- * Fully initialize a .request_fn request-based queue.
- */
-static int dm_old_init_request_queue(struct mapped_device *md)
-{
-	/* Fully initialize the queue */
-	if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL))
-		return -EINVAL;
-
-	/* disable dm_request_fn's merge heuristic by default */
-	md->seq_rq_merge_deadline_usecs = 0;
-
-	dm_init_normal_md_queue(md);
-	blk_queue_softirq_done(md->queue, dm_softirq_done);
-	blk_queue_prep_rq(md->queue, dm_old_prep_fn);
-
-	dm_old_init_rq_based_worker_thread(md);
-
-	elv_register_queue(md->queue);
-
-	return 0;
-}
-
-static int dm_mq_init_request(void *data, struct request *rq,
-			      unsigned int hctx_idx, unsigned int request_idx,
-			      unsigned int numa_node)
-{
-	struct mapped_device *md = data;
-	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
-
-	/*
-	 * Must initialize md member of tio, otherwise it won't
-	 * be available in dm_mq_queue_rq.
-	 */
-	tio->md = md;
-
-	if (md->init_tio_pdu) {
-		/* target-specific per-io data is immediately after the tio */
-		tio->info.ptr = tio + 1;
-	}
-
-	return 0;
-}
-
-static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
-			  const struct blk_mq_queue_data *bd)
-{
-	struct request *rq = bd->rq;
-	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
-	struct mapped_device *md = tio->md;
-	struct dm_target *ti = md->immutable_target;
-
-	if (unlikely(!ti)) {
-		int srcu_idx;
-		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-
-		ti = dm_table_find_target(map, 0);
-		dm_put_live_table(md, srcu_idx);
-	}
-
-	if (ti->type->busy && ti->type->busy(ti))
-		return BLK_MQ_RQ_QUEUE_BUSY;
-
-	dm_start_request(md, rq);
-
-	/* Init tio using md established in .init_request */
-	init_tio(tio, rq, md);
-
-	/*
-	 * Establish tio->ti before queuing work (map_tio_request)
-	 * or making direct call to map_request().
-	 */
-	tio->ti = ti;
-
-	/* Direct call is fine since .queue_rq allows allocations */
-	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
-		/* Undo dm_start_request() before requeuing */
-		rq_end_stats(md, rq);
-		rq_completed(md, rq_data_dir(rq), false);
-		return BLK_MQ_RQ_QUEUE_BUSY;
-	}
-
-	return BLK_MQ_RQ_QUEUE_OK;
-}
-
-static struct blk_mq_ops dm_mq_ops = {
-	.queue_rq = dm_mq_queue_rq,
-	.map_queue = blk_mq_map_queue,
-	.complete = dm_softirq_done,
-	.init_request = dm_mq_init_request,
-};
-
-static int dm_mq_init_request_queue(struct mapped_device *md,
-				    struct dm_target *immutable_tgt)
-{
-	struct request_queue *q;
-	int err;
-
-	if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
-		DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
-		return -EINVAL;
-	}
-
-	md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
-	if (!md->tag_set)
-		return -ENOMEM;
-
-	md->tag_set->ops = &dm_mq_ops;
-	md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
-	md->tag_set->numa_node = md->numa_node_id;
-	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
-	md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
-	md->tag_set->driver_data = md;
-
-	md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
-	if (immutable_tgt && immutable_tgt->per_io_data_size) {
-		/* any target-specific per-io data is immediately after the tio */
-		md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
-		md->init_tio_pdu = true;
-	}
-
-	err = blk_mq_alloc_tag_set(md->tag_set);
-	if (err)
-		goto out_kfree_tag_set;
-
-	q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
-	if (IS_ERR(q)) {
-		err = PTR_ERR(q);
-		goto out_tag_set;
-	}
-	dm_init_md_queue(md);
-
-	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
-	blk_mq_register_disk(md->disk);
-
-	return 0;
-
-out_tag_set:
-	blk_mq_free_tag_set(md->tag_set);
-out_kfree_tag_set:
-	kfree(md->tag_set);
-
-	return err;
-}
-
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
-	if (type == DM_TYPE_BIO_BASED)
-		return type;
-
-	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
 /*
 /*
  * Setup the DM device's queue based on md's type
  * Setup the DM device's queue based on md's type
  */
  */
 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 {
 {
 	int r;
 	int r;
-	unsigned md_type = filter_md_type(dm_get_md_type(md), md);
+	unsigned type = dm_get_md_type(md);
 
 
-	switch (md_type) {
+	switch (type) {
 	case DM_TYPE_REQUEST_BASED:
 	case DM_TYPE_REQUEST_BASED:
 		r = dm_old_init_request_queue(md);
 		r = dm_old_init_request_queue(md);
 		if (r) {
 		if (r) {
@@ -2835,13 +1782,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 		}
 		}
 		break;
 		break;
 	case DM_TYPE_MQ_REQUEST_BASED:
 	case DM_TYPE_MQ_REQUEST_BASED:
-		r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t));
+		r = dm_mq_init_request_queue(md, t);
 		if (r) {
 		if (r) {
 			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
 			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
 			return r;
 			return r;
 		}
 		}
 		break;
 		break;
 	case DM_TYPE_BIO_BASED:
 	case DM_TYPE_BIO_BASED:
+	case DM_TYPE_DAX_BIO_BASED:
 		dm_init_normal_md_queue(md);
 		dm_init_normal_md_queue(md);
 		blk_queue_make_request(md->queue, dm_make_request);
 		blk_queue_make_request(md->queue, dm_make_request);
 		/*
 		/*
@@ -2850,6 +1798,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 		 */
 		 */
 		bioset_free(md->queue->bio_split);
 		bioset_free(md->queue->bio_split);
 		md->queue->bio_split = NULL;
 		md->queue->bio_split = NULL;
+
+		if (type == DM_TYPE_DAX_BIO_BASED)
+			queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
 		break;
 		break;
 	}
 	}
 
 
@@ -3544,10 +2495,9 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 	if (!pools)
 	if (!pools)
 		return NULL;
 		return NULL;
 
 
-	type = filter_md_type(type, md);
-
 	switch (type) {
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
 	case DM_TYPE_BIO_BASED:
+	case DM_TYPE_DAX_BIO_BASED:
 		cachep = _io_cache;
 		cachep = _io_cache;
 		pool_size = dm_get_reserved_bio_based_ios();
 		pool_size = dm_get_reserved_bio_based_ios();
 		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
@@ -3604,26 +2554,76 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 	kfree(pools);
 	kfree(pools);
 }
 }
 
 
-static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
-			  u32 flags)
+struct dm_pr {
+	u64	old_key;
+	u64	new_key;
+	u32	flags;
+	bool	fail_early;
+};
+
+static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
+		      void *data)
 {
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
 	struct mapped_device *md = bdev->bd_disk->private_data;
-	const struct pr_ops *ops;
-	fmode_t mode;
-	int r;
+	struct dm_table *table;
+	struct dm_target *ti;
+	int ret = -ENOTTY, srcu_idx;
 
 
-	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
-	if (r < 0)
-		return r;
+	table = dm_get_live_table(md, &srcu_idx);
+	if (!table || !dm_table_get_size(table))
+		goto out;
 
 
-	ops = bdev->bd_disk->fops->pr_ops;
-	if (ops && ops->pr_register)
-		r = ops->pr_register(bdev, old_key, new_key, flags);
-	else
-		r = -EOPNOTSUPP;
+	/* We only support devices that have a single target */
+	if (dm_table_get_num_targets(table) != 1)
+		goto out;
+	ti = dm_table_get_target(table, 0);
 
 
-	bdput(bdev);
-	return r;
+	ret = -EINVAL;
+	if (!ti->type->iterate_devices)
+		goto out;
+
+	ret = ti->type->iterate_devices(ti, fn, data);
+out:
+	dm_put_live_table(md, srcu_idx);
+	return ret;
+}
+
+/*
+ * For register / unregister we need to manually call out to every path.
+ */
+static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
+			    sector_t start, sector_t len, void *data)
+{
+	struct dm_pr *pr = data;
+	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
+
+	if (!ops || !ops->pr_register)
+		return -EOPNOTSUPP;
+	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
+}
+
+static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
+			  u32 flags)
+{
+	struct dm_pr pr = {
+		.old_key	= old_key,
+		.new_key	= new_key,
+		.flags		= flags,
+		.fail_early	= true,
+	};
+	int ret;
+
+	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
+	if (ret && new_key) {
+		/* unregister all paths if we failed to register any path */
+		pr.old_key = new_key;
+		pr.new_key = 0;
+		pr.flags = 0;
+		pr.fail_early = false;
+		dm_call_pr(bdev, __dm_pr_register, &pr);
+	}
+
+	return ret;
 }
 }
 
 
 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
@@ -3724,6 +2724,7 @@ static const struct block_device_operations dm_blk_dops = {
 	.open = dm_blk_open,
 	.open = dm_blk_open,
 	.release = dm_blk_close,
 	.release = dm_blk_close,
 	.ioctl = dm_blk_ioctl,
 	.ioctl = dm_blk_ioctl,
+	.direct_access = dm_blk_direct_access,
 	.getgeo = dm_blk_getgeo,
 	.getgeo = dm_blk_getgeo,
 	.pr_ops = &dm_pr_ops,
 	.pr_ops = &dm_pr_ops,
 	.owner = THIS_MODULE
 	.owner = THIS_MODULE
@@ -3741,18 +2742,6 @@ MODULE_PARM_DESC(major, "The major number of the device mapper");
 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 
 
-module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
-
-module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
-
-module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
-
-module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");
-
 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
 
 

+ 4 - 32
drivers/md/dm.h

@@ -13,6 +13,7 @@
 #include <linux/fs.h>
 #include <linux/fs.h>
 #include <linux/device-mapper.h>
 #include <linux/device-mapper.h>
 #include <linux/list.h>
 #include <linux/list.h>
+#include <linux/moduleparam.h>
 #include <linux/blkdev.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/backing-dev.h>
 #include <linux/hdreg.h>
 #include <linux/hdreg.h>
@@ -32,14 +33,6 @@
  */
  */
 #define DM_STATUS_NOFLUSH_FLAG		(1 << 0)
 #define DM_STATUS_NOFLUSH_FLAG		(1 << 0)
 
 
-/*
- * Type of table and mapped_device's mempool
- */
-#define DM_TYPE_NONE			0
-#define DM_TYPE_BIO_BASED		1
-#define DM_TYPE_REQUEST_BASED		2
-#define DM_TYPE_MQ_REQUEST_BASED	3
-
 /*
 /*
  * List of devices that a metadevice uses and should open/close.
  * List of devices that a metadevice uses and should open/close.
  */
  */
@@ -75,8 +68,9 @@ unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
 struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
+bool dm_table_bio_based(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
-bool dm_table_mq_request_based(struct dm_table *t);
+bool dm_table_all_blk_mq_devices(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
 
@@ -161,16 +155,6 @@ void dm_interface_exit(void);
 /*
 /*
  * sysfs interface
  * sysfs interface
  */
  */
-struct dm_kobject_holder {
-	struct kobject kobj;
-	struct completion completion;
-};
-
-static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
-{
-	return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
-}
-
 int dm_sysfs_init(struct mapped_device *md);
 int dm_sysfs_init(struct mapped_device *md);
 void dm_sysfs_exit(struct mapped_device *md);
 void dm_sysfs_exit(struct mapped_device *md);
 struct kobject *dm_kobject(struct mapped_device *md);
 struct kobject *dm_kobject(struct mapped_device *md);
@@ -212,8 +196,6 @@ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 void dm_internal_suspend(struct mapped_device *md);
 void dm_internal_suspend(struct mapped_device *md);
 void dm_internal_resume(struct mapped_device *md);
 void dm_internal_resume(struct mapped_device *md);
 
 
-bool dm_use_blk_mq(struct mapped_device *md);
-
 int dm_io_init(void);
 int dm_io_init(void);
 void dm_io_exit(void);
 void dm_io_exit(void);
 
 
@@ -228,18 +210,8 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 
 /*
 /*
- * Helpers that are used by DM core
+ * Various helpers
  */
  */
 unsigned dm_get_reserved_bio_based_ios(void);
 unsigned dm_get_reserved_bio_based_ios(void);
-unsigned dm_get_reserved_rq_based_ios(void);
-
-static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
-{
-	return !maxlen || strlen(result) + 1 >= maxlen;
-}
-
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
-						     const char *buf, size_t count);
 
 
 #endif
 #endif

+ 8 - 1
drivers/md/persistent-data/dm-btree.c

@@ -429,7 +429,14 @@ static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t ro
 
 
 	if (flags & INTERNAL_NODE) {
 	if (flags & INTERNAL_NODE) {
 		i = lower_bound(n, key);
 		i = lower_bound(n, key);
-		if (i < 0 || i >= nr_entries) {
+		if (i < 0) {
+			/*
+			 * avoid early -ENODATA return when all entries are
+			 * higher than the search @key.
+			 */
+			i = 0;
+		}
+		if (i >= nr_entries) {
 			r = -ENODATA;
 			r = -ENODATA;
 			goto out;
 			goto out;
 		}
 		}

+ 1 - 2
drivers/scsi/sd.c

@@ -1619,8 +1619,7 @@ static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
 		return -EOPNOTSUPP;
 		return -EOPNOTSUPP;
 	return sd_pr_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00,
 	return sd_pr_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00,
 			old_key, new_key, 0,
 			old_key, new_key, 0,
-			(1 << 0) /* APTPL */ |
-			(1 << 2) /* ALL_TG_PT */);
+			(1 << 0) /* APTPL */);
 }
 }
 
 
 static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
 static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,

+ 26 - 0
include/linux/device-mapper.h

@@ -19,6 +19,15 @@ struct dm_table;
 struct mapped_device;
 struct mapped_device;
 struct bio_vec;
 struct bio_vec;
 
 
+/*
+ * Type of table, mapped_device's mempool and request_queue
+ */
+#define DM_TYPE_NONE			0
+#define DM_TYPE_BIO_BASED		1
+#define DM_TYPE_REQUEST_BASED		2
+#define DM_TYPE_MQ_REQUEST_BASED	3
+#define DM_TYPE_DAX_BIO_BASED		4
+
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 
 
 union map_info {
 union map_info {
@@ -116,6 +125,14 @@ typedef void (*dm_io_hints_fn) (struct dm_target *ti,
  */
  */
 typedef int (*dm_busy_fn) (struct dm_target *ti);
 typedef int (*dm_busy_fn) (struct dm_target *ti);
 
 
+/*
+ * Returns:
+ *  < 0 : error
+ * >= 0 : the number of bytes accessible at the address
+ */
+typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector,
+				     void __pmem **kaddr, pfn_t *pfn, long size);
+
 void dm_error(const char *message);
 void dm_error(const char *message);
 
 
 struct dm_dev {
 struct dm_dev {
@@ -162,6 +179,7 @@ struct target_type {
 	dm_busy_fn busy;
 	dm_busy_fn busy;
 	dm_iterate_devices_fn iterate_devices;
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
 	dm_io_hints_fn io_hints;
+	dm_direct_access_fn direct_access;
 
 
 	/* For internal device-mapper use. */
 	/* For internal device-mapper use. */
 	struct list_head list;
 	struct list_head list;
@@ -443,6 +461,14 @@ int dm_table_add_target(struct dm_table *t, const char *type,
  */
  */
 void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
 void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
 
 
+/*
+ * Target can use this to set the table's type.
+ * Can only ever be called from a target's ctr.
+ * Useful for "hybrid" target (supports both bio-based
+ * and request-based).
+ */
+void dm_table_set_type(struct dm_table *t, unsigned type);
+
 /*
 /*
  * Finally call this to make the table ready for use.
  * Finally call this to make the table ready for use.
  */
  */

+ 2 - 2
include/uapi/linux/dm-ioctl.h

@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 
 #define DM_VERSION_MAJOR	4
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	34
+#define DM_VERSION_MINOR	35
 #define DM_VERSION_PATCHLEVEL	0
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2015-10-28)"
+#define DM_VERSION_EXTRA	"-ioctl (2016-06-23)"
 
 
 /* Status bits */
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */