16 年之前 · c3cb5e1939
--- a/Documentation/device-mapper/dm-log.txt
+++ b/Documentation/device-mapper/dm-log.txt
@@ -0,0 +1,54 @@
 
				+Device-Mapper Logging
			
 
				+=====================
			
 
				+The device-mapper logging code is used by some of the device-mapper
			
 
				+RAID targets to track regions of the disk that are not consistent.
			
 
				+A region (or portion of the address space) of the disk may be
			
 
				+inconsistent because a RAID stripe is currently being operated on or
			
 
				+a machine died while the region was being altered.  In the case of
			
 
				+mirrors, a region would be considered dirty/inconsistent while you
			
 
				+are writing to it because the writes need to be replicated for all
			
 
				+the legs of the mirror and may not reach the legs at the same time.
			
 
				+Once all writes are complete, the region is considered clean again.
			
 
				+
			
 
				+There is a generic logging interface that the device-mapper RAID
			
 
				+implementations use to perform logging operations (see
			
 
				+dm_dirty_log_type in include/linux/dm-dirty-log.h).  Various different
			
 
				+logging implementations are available and provide different
			
 
				+capabilities.  The list includes:
			
 
				+
			
 
				+Type		Files
			
 
				+====		=====
			
 
				+disk		drivers/md/dm-log.c
			
 
				+core		drivers/md/dm-log.c
			
 
				+userspace	drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h
			
 
				+
			
 
				+The "disk" log type
			
 
				+-------------------
			
 
				+This log implementation commits the log state to disk.  This way, the
			
 
				+logging state survives reboots/crashes.
			
 
				+
			
 
				+The "core" log type
			
 
				+-------------------
			
 
				+This log implementation keeps the log state in memory.  The log state
			
 
				+will not survive a reboot or crash, but there may be a small boost in
			
 
				+performance.  This method can also be used if no storage device is
			
 
				+available for storing log state.
			
 
				+
			
 
				+The "userspace" log type
			
 
				+------------------------
			
 
				+This log type simply provides a way to export the log API to userspace,
			
 
				+so log implementations can be done there.  This is done by forwarding most
			
 
				+logging requests to userspace, where a daemon receives and processes the
			
 
				+request.
			
 
				+
			
 
				+The structures used for communication between kernel and userspace are
			
 
				+located in include/linux/dm-log-userspace.h.  Due to the frequency,
			
 
				+diversity, and 2-way communication nature of the exchanges between
			
 
				+kernel and userspace, 'connector' is used as the interface for
			
 
				+communication.
			
 
				+
			
 
				+There are currently two userspace log implementations that leverage this
			
 
				+framework - "clustered_disk" and "clustered_core".  These implementations
			
 
				+provide a cluster-coherent log for shared-storage.  Device-mapper mirroring
			
 
				+can be used in a shared-storage environment when the cluster log implementations
			
 
				+are employed.
			
--- a/Documentation/device-mapper/dm-queue-length.txt
+++ b/Documentation/device-mapper/dm-queue-length.txt
@@ -0,0 +1,39 @@
 
				+dm-queue-length
			
 
				+===============
			
 
				+
			
 
				+dm-queue-length is a path selector module for device-mapper targets,
			
 
				+which selects a path with the least number of in-flight I/Os.
			
 
				+The path selector name is 'queue-length'.
			
 
				+
			
 
				+Table parameters for each path: [<repeat_count>]
			
 
				+	<repeat_count>: The number of I/Os to dispatch using the selected
			
 
				+			path before switching to the next path.
			
 
				+			If not given, internal default is used. To check
			
 
				+			the default value, see the activated table.
			
 
				+
			
 
				+Status for each path: <status> <fail-count> <in-flight>
			
 
				+	<status>: 'A' if the path is active, 'F' if the path is failed.
			
 
				+	<fail-count>: The number of path failures.
			
 
				+	<in-flight>: The number of in-flight I/Os on the path.
			
 
				+
			
 
				+
			
 
				+Algorithm
			
 
				+=========
			
 
				+
			
 
				+dm-queue-length increments/decrements 'in-flight' when an I/O is
			
 
				+dispatched/completed respectively.
			
 
				+dm-queue-length selects a path with the minimum 'in-flight'.
			
 
				+
			
 
				+
			
 
				+Examples
			
 
				+========
			
 
				+In case that 2 paths (sda and sdb) are used with repeat_count == 128.
			
 
				+
			
 
				+# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
			
 
				+  | dmsetup create test
			
 
				+#
			
 
				+# dmsetup table
			
 
				+test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
			
 
				+#
			
 
				+# dmsetup status
			
 
				+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
			
--- a/Documentation/device-mapper/dm-service-time.txt
+++ b/Documentation/device-mapper/dm-service-time.txt
@@ -0,0 +1,91 @@
 
				+dm-service-time
			
 
				+===============
			
 
				+
			
 
				+dm-service-time is a path selector module for device-mapper targets,
			
 
				+which selects a path with the shortest estimated service time for
			
 
				+the incoming I/O.
			
 
				+
			
 
				+The service time for each path is estimated by dividing the total size
			
 
				+of in-flight I/Os on a path by the performance value of the path.
			
 
				+The performance value is a relative throughput value among all paths
			
 
				+in a path-group, and it can be specified as a table argument.
			
 
				+
			
 
				+The path selector name is 'service-time'.
			
 
				+
			
 
				+Table parameters for each path: [<repeat_count> [<relative_throughput>]]
			
 
				+	<repeat_count>: The number of I/Os to dispatch using the selected
			
 
				+			path before switching to the next path.
			
 
				+			If not given, internal default is used.  To check
			
 
				+			the default value, see the activated table.
			
 
				+	<relative_throughput>: The relative throughput value of the path
			
 
				+			among all paths in the path-group.
			
 
				+			The valid range is 0-100.
			
 
				+			If not given, minimum value '1' is used.
			
 
				+			If '0' is given, the path isn't selected while
			
 
				+			other paths having a positive value are available.
			
 
				+
			
 
				+Status for each path: <status> <fail-count> <in-flight-size> \
			
 
				+		      <relative_throughput>
			
 
				+	<status>: 'A' if the path is active, 'F' if the path is failed.
			
 
				+	<fail-count>: The number of path failures.
			
 
				+	<in-flight-size>: The size of in-flight I/Os on the path.
			
 
				+	<relative_throughput>: The relative throughput value of the path
			
 
				+			among all paths in the path-group.
			
 
				+
			
 
				+
			
 
				+Algorithm
			
 
				+=========
			
 
				+
			
 
				+dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
			
 
				+dispatched and subtracts when completed.
			
 
				+Basically, dm-service-time selects a path having minimum service time
			
 
				+which is calculated by:
			
 
				+
			
 
				+	('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
			
 
				+
			
 
				+However, some optimizations below are used to reduce the calculation
			
 
				+as much as possible.
			
 
				+
			
 
				+	1. If the paths have the same 'relative_throughput', skip
			
 
				+	   the division and just compare the 'in-flight-size'.
			
 
				+
			
 
				+	2. If the paths have the same 'in-flight-size', skip the division
			
 
				+	   and just compare the 'relative_throughput'.
			
 
				+
			
 
				+	3. If some paths have non-zero 'relative_throughput' and others
			
 
				+	   have zero 'relative_throughput', ignore those paths with zero
			
 
				+	   'relative_throughput'.
			
 
				+
			
 
				+If such optimizations can't be applied, calculate service time, and
			
 
				+compare service time.
			
 
				+If calculated service time is equal, the path having maximum
			
 
				+'relative_throughput' may be better.  So compare 'relative_throughput'
			
 
				+then.
			
 
				+
			
 
				+
			
 
				+Examples
			
 
				+========
			
 
				+In case that 2 paths (sda and sdb) are used with repeat_count == 128
			
 
				+and sda has an average throughput 1GB/s and sdb has 4GB/s,
			
 
				+'relative_throughput' value may be '1' for sda and '4' for sdb.
			
 
				+
			
 
				+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
			
 
				+  | dmsetup create test
			
 
				+#
			
 
				+# dmsetup table
			
 
				+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
			
 
				+#
			
 
				+# dmsetup status
			
 
				+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
			
 
				+
			
 
				+
			
 
				+Or '2' for sda and '8' for sdb would also be valid.
			
 
				+
			
 
				+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
			
 
				+  | dmsetup create test
			
 
				+#
			
 
				+# dmsetup table
			
 
				+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
			
 
				+#
			
 
				+# dmsetup status
			
 
				+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
			
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -231,6 +231,17 @@ config DM_MIRROR
 
				          Allow volume managers to mirror logical volumes, also
			
 
				          needed for live data migration tools such as 'pvmove'.
			
 
				 
			
 
				+config DM_LOG_USERSPACE
			
 
				+	tristate "Mirror userspace logging (EXPERIMENTAL)"
			
 
				+	depends on DM_MIRROR && EXPERIMENTAL && NET
			
 
				+	select CONNECTOR
			
 
				+	---help---
			
 
				+	  The userspace logging module provides a mechanism for
			
 
				+	  relaying the dm-dirty-log API to userspace.  Log designs
			
 
				+	  which are more suited to userspace implementation (e.g.
			
 
				+	  shared storage logs) or experimental logs can be implemented
			
 
				+	  by leveraging this framework.
			
 
				+
			
 
				 config DM_ZERO
			
 
				 	tristate "Zero target"
			
 
				 	depends on BLK_DEV_DM
			
@@ -249,6 +260,25 @@ config DM_MULTIPATH
 
				 	---help---
			
 
				 	  Allow volume managers to support multipath hardware.
			
 
				 
			
 
				+config DM_MULTIPATH_QL
			
 
				+	tristate "I/O Path Selector based on the number of in-flight I/Os"
			
 
				+	depends on DM_MULTIPATH
			
 
				+	---help---
			
 
				+	  This path selector is a dynamic load balancer which selects
			
 
				+	  the path with the least number of in-flight I/Os.
			
 
				+
			
 
				+	  If unsure, say N.
			
 
				+
			
 
				+config DM_MULTIPATH_ST
			
 
				+	tristate "I/O Path Selector based on the service time"
			
 
				+	depends on DM_MULTIPATH
			
 
				+	---help---
			
 
				+	  This path selector is a dynamic load balancer which selects
			
 
				+	  the path expected to complete the incoming I/O in the shortest
			
 
				+	  time.
			
 
				+
			
 
				+	  If unsure, say N.
			
 
				+
			
 
				 config DM_DELAY
			
 
				 	tristate "I/O delaying target (EXPERIMENTAL)"
			
 
				 	depends on BLK_DEV_DM && EXPERIMENTAL
			
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,6 +8,8 @@ dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 
				 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
			
 
				 		    dm-snap-persistent.o
			
 
				 dm-mirror-y	+= dm-raid1.o
			
 
				+dm-log-userspace-y \
			
 
				+		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
			
 
				 md-mod-y	+= md.o bitmap.o
			
 
				 raid456-y	+= raid5.o
			
 
				 raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o \
			
@@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 
				 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
			
 
				 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
			
 
				 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
			
 
				+obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
			
 
				+obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
			
 
				 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
			
 
				 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
			
 
				+obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
			
 
				 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
			
 
				 
			
 
				 quiet_cmd_unroll = UNROLL  $@
			
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 		goto bad_crypt_queue;
			
 
				 	}
			
 
				 
			
 
				+	ti->num_flush_requests = 1;
			
 
				 	ti->private = cc;
			
 
				 	return 0;
			
 
				 
			
@@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 
				 		     union map_info *map_context)
			
 
				 {
			
 
				 	struct dm_crypt_io *io;
			
 
				+	struct crypt_config *cc;
			
 
				+
			
 
				+	if (unlikely(bio_empty_barrier(bio))) {
			
 
				+		cc = ti->private;
			
 
				+		bio->bi_bdev = cc->dev->bdev;
			
 
				+		return DM_MAPIO_REMAPPED;
			
 
				+	}
			
 
				 
			
 
				 	io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin);
			
 
				 
			
@@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 
				 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
			
 
				 }
			
 
				 
			
 
				+static int crypt_iterate_devices(struct dm_target *ti,
			
 
				+				 iterate_devices_callout_fn fn, void *data)
			
 
				+{
			
 
				+	struct crypt_config *cc = ti->private;
			
 
				+
			
 
				+	return fn(ti, cc->dev, cc->start, data);
			
 
				+}
			
 
				+
			
 
				 static struct target_type crypt_target = {
			
 
				 	.name   = "crypt",
			
 
				-	.version= {1, 6, 0},
			
 
				+	.version = {1, 7, 0},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr    = crypt_ctr,
			
 
				 	.dtr    = crypt_dtr,
			
@@ -1318,6 +1334,7 @@ static struct target_type crypt_target = {
 
				 	.resume = crypt_resume,
			
 
				 	.message = crypt_message,
			
 
				 	.merge  = crypt_merge,
			
 
				+	.iterate_devices = crypt_iterate_devices,
			
 
				 };
			
 
				 
			
 
				 static int __init dm_crypt_init(void)
			
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -197,6 +197,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	mutex_init(&dc->timer_lock);
			
 
				 	atomic_set(&dc->may_delay, 1);
			
 
				 
			
 
				+	ti->num_flush_requests = 1;
			
 
				 	ti->private = dc;
			
 
				 	return 0;
			
 
				 
			
@@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
 
				 
			
 
				 	if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
			
 
				 		bio->bi_bdev = dc->dev_write->bdev;
			
 
				-		bio->bi_sector = dc->start_write +
			
 
				-				 (bio->bi_sector - ti->begin);
			
 
				+		if (bio_sectors(bio))
			
 
				+			bio->bi_sector = dc->start_write +
			
 
				+					 (bio->bi_sector - ti->begin);
			
 
				 
			
 
				 		return delay_bio(dc, dc->write_delay, bio);
			
 
				 	}
			
@@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static int delay_iterate_devices(struct dm_target *ti,
			
 
				+				 iterate_devices_callout_fn fn, void *data)
			
 
				+{
			
 
				+	struct delay_c *dc = ti->private;
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	ret = fn(ti, dc->dev_read, dc->start_read, data);
			
 
				+	if (ret)
			
 
				+		goto out;
			
 
				+
			
 
				+	if (dc->dev_write)
			
 
				+		ret = fn(ti, dc->dev_write, dc->start_write, data);
			
 
				+
			
 
				+out:
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 static struct target_type delay_target = {
			
 
				 	.name	     = "delay",
			
 
				-	.version     = {1, 0, 2},
			
 
				+	.version     = {1, 1, 0},
			
 
				 	.module      = THIS_MODULE,
			
 
				 	.ctr	     = delay_ctr,
			
 
				 	.dtr	     = delay_dtr,
			
@@ -326,6 +345,7 @@ static struct target_type delay_target = {
 
				 	.presuspend  = delay_presuspend,
			
 
				 	.resume	     = delay_resume,
			
 
				 	.status	     = delay_status,
			
 
				+	.iterate_devices = delay_iterate_devices,
			
 
				 };
			
 
				 
			
 
				 static int __init dm_delay_init(void)
			
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -216,7 +216,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 
				 		return -EINVAL;
			
 
				 	}
			
 
				 
			
 
				-	type = get_type(argv[1]);
			
 
				+	type = get_type(&persistent);
			
 
				 	if (!type) {
			
 
				 		ti->error = "Exception store type not recognised";
			
 
				 		r = -EINVAL;
			
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
 
				  */
			
 
				 static inline sector_t get_dev_size(struct block_device *bdev)
			
 
				 {
			
 
				-	return bdev->bd_inode->i_size >> SECTOR_SHIFT;
			
 
				+	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
			
 
				 }
			
 
				 
			
 
				 static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
			
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,7 @@ struct dm_io_client {
 
				 /* FIXME: can we shrink this ? */
			
 
				 struct io {
			
 
				 	unsigned long error_bits;
			
 
				+	unsigned long eopnotsupp_bits;
			
 
				 	atomic_t count;
			
 
				 	struct task_struct *sleeper;
			
 
				 	struct dm_io_client *client;
			
@@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio)
 
				  *---------------------------------------------------------------*/
			
 
				 static void dec_count(struct io *io, unsigned int region, int error)
			
 
				 {
			
 
				-	if (error)
			
 
				+	if (error) {
			
 
				 		set_bit(region, &io->error_bits);
			
 
				+		if (error == -EOPNOTSUPP)
			
 
				+			set_bit(region, &io->eopnotsupp_bits);
			
 
				+	}
			
 
				 
			
 
				 	if (atomic_dec_and_test(&io->count)) {
			
 
				 		if (io->sleeper)
			
@@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 
				 		return -EIO;
			
 
				 	}
			
 
				 
			
 
				+retry:
			
 
				 	io.error_bits = 0;
			
 
				+	io.eopnotsupp_bits = 0;
			
 
				 	atomic_set(&io.count, 1); /* see dispatch_io() */
			
 
				 	io.sleeper = current;
			
 
				 	io.client = client;
			
@@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 
				 	}
			
 
				 	set_current_state(TASK_RUNNING);
			
 
				 
			
 
				+	if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
			
 
				+		rw &= ~(1 << BIO_RW_BARRIER);
			
 
				+		goto retry;
			
 
				+	}
			
 
				+
			
 
				 	if (error_bits)
			
 
				 		*error_bits = io.error_bits;
			
 
				 
			
@@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 
				 
			
 
				 	io = mempool_alloc(client->pool, GFP_NOIO);
			
 
				 	io->error_bits = 0;
			
 
				+	io->eopnotsupp_bits = 0;
			
 
				 	atomic_set(&io->count, 1); /* see dispatch_io() */
			
 
				 	io->sleeper = NULL;
			
 
				 	io->client = client;
			
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -276,7 +276,7 @@ static void dm_hash_remove_all(int keep_open_devices)
 
				 	up_write(&_hash_lock);
			
 
				 }
			
 
				 
			
 
				-static int dm_hash_rename(const char *old, const char *new)
			
 
				+static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
			
 
				 {
			
 
				 	char *new_name, *old_name;
			
 
				 	struct hash_cell *hc;
			
@@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new)
 
				 		dm_table_put(table);
			
 
				 	}
			
 
				 
			
 
				-	dm_kobject_uevent(hc->md);
			
 
				+	dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie);
			
 
				 
			
 
				 	dm_put(hc->md);
			
 
				 	up_write(&_hash_lock);
			
@@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 
				 
			
 
				 	__hash_remove(hc);
			
 
				 	up_write(&_hash_lock);
			
 
				+
			
 
				+	dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr);
			
 
				+
			
 
				 	dm_put(md);
			
 
				 	param->data_size = 0;
			
 
				 	return 0;
			
@@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 
				 		return r;
			
 
				 
			
 
				 	param->data_size = 0;
			
 
				-	return dm_hash_rename(param->name, new_name);
			
 
				+	return dm_hash_rename(param->event_nr, param->name, new_name);
			
 
				 }
			
 
				 
			
 
				 static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
			
@@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param)
 
				 	if (dm_suspended(md))
			
 
				 		r = dm_resume(md);
			
 
				 
			
 
				-	if (!r)
			
 
				+
			
 
				+	if (!r) {
			
 
				+		dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
			
 
				 		r = __dev_status(md, param);
			
 
				+	}
			
 
				 
			
 
				 	dm_put(md);
			
 
				 	return r;
			
@@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table,
 
				 		next = spec->next;
			
 
				 	}
			
 
				 
			
 
				+	r = dm_table_set_type(table);
			
 
				+	if (r) {
			
 
				+		DMWARN("unable to set table type");
			
 
				+		return r;
			
 
				+	}
			
 
				+
			
 
				 	return dm_table_complete(table);
			
 
				 }
			
 
				 
			
@@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				+	r = dm_table_alloc_md_mempools(t);
			
 
				+	if (r) {
			
 
				+		DMWARN("unable to allocate mempools for this table");
			
 
				+		dm_table_destroy(t);
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				 	down_write(&_hash_lock);
			
 
				 	hc = dm_get_mdptr(md);
			
 
				 	if (!hc || hc->md != md) {
			
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 		goto bad;
			
 
				 	}
			
 
				 
			
 
				+	ti->num_flush_requests = 1;
			
 
				 	ti->private = lc;
			
 
				 	return 0;
			
 
				 
			
@@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
 
				 	struct linear_c *lc = ti->private;
			
 
				 
			
 
				 	bio->bi_bdev = lc->dev->bdev;
			
 
				-	bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
			
 
				+	if (bio_sectors(bio))
			
 
				+		bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
			
 
				 }
			
 
				 
			
 
				 static int linear_map(struct dm_target *ti, struct bio *bio,
			
@@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 
				 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
			
 
				 }
			
 
				 
			
 
				+static int linear_iterate_devices(struct dm_target *ti,
			
 
				+				  iterate_devices_callout_fn fn, void *data)
			
 
				+{
			
 
				+	struct linear_c *lc = ti->private;
			
 
				+
			
 
				+	return fn(ti, lc->dev, lc->start, data);
			
 
				+}
			
 
				+
			
 
				 static struct target_type linear_target = {
			
 
				 	.name   = "linear",
			
 
				-	.version= {1, 0, 3},
			
 
				+	.version = {1, 1, 0},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr    = linear_ctr,
			
 
				 	.dtr    = linear_dtr,
			
@@ -142,6 +152,7 @@ static struct target_type linear_target = {
 
				 	.status = linear_status,
			
 
				 	.ioctl  = linear_ioctl,
			
 
				 	.merge  = linear_merge,
			
 
				+	.iterate_devices = linear_iterate_devices,
			
 
				 };
			
 
				 
			
 
				 int __init dm_linear_init(void)
			
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,696 @@
 
				+/*
			
 
				+ * Copyright (C) 2006-2009 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the LGPL.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/bio.h>
			
 
				+#include <linux/dm-dirty-log.h>
			
 
				+#include <linux/device-mapper.h>
			
 
				+#include <linux/dm-log-userspace.h>
			
 
				+
			
 
				+#include "dm-log-userspace-transfer.h"
			
 
				+
			
 
				+struct flush_entry {
			
 
				+	int type;
			
 
				+	region_t region;
			
 
				+	struct list_head list;
			
 
				+};
			
 
				+
			
 
				+struct log_c {
			
 
				+	struct dm_target *ti;
			
 
				+	uint32_t region_size;
			
 
				+	region_t region_count;
			
 
				+	char uuid[DM_UUID_LEN];
			
 
				+
			
 
				+	char *usr_argv_str;
			
 
				+	uint32_t usr_argc;
			
 
				+
			
 
				+	/*
			
 
				+	 * in_sync_hint gets set when doing is_remote_recovering.  It
			
 
				+	 * represents the first region that needs recovery.  IOW, the
			
 
				+	 * first zero bit of sync_bits.  This can be useful to limit
			
 
				+	 * traffic for calls like is_remote_recovering and get_resync_work,
			
 
				+	 * but take care in its use for anything else.
			
 
				+	 */
			
 
				+	uint64_t in_sync_hint;
			
 
				+
			
 
				+	spinlock_t flush_lock;
			
 
				+	struct list_head flush_list;  /* only for clear and mark requests */
			
 
				+};
			
 
				+
			
 
				+static mempool_t *flush_entry_pool;
			
 
				+
			
 
				+static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
			
 
				+{
			
 
				+	return kmalloc(sizeof(struct flush_entry), gfp_mask);
			
 
				+}
			
 
				+
			
 
				+static void flush_entry_free(void *element, void *pool_data)
			
 
				+{
			
 
				+	kfree(element);
			
 
				+}
			
 
				+
			
 
				+static int userspace_do_request(struct log_c *lc, const char *uuid,
			
 
				+				int request_type, char *data, size_t data_size,
			
 
				+				char *rdata, size_t *rdata_size)
			
 
				+{
			
 
				+	int r;
			
 
				+
			
 
				+	/*
			
 
				+	 * If the server isn't there, -ESRCH is returned,
			
 
				+	 * and we must keep trying until the server is
			
 
				+	 * restored.
			
 
				+	 */
			
 
				+retry:
			
 
				+	r = dm_consult_userspace(uuid, request_type, data,
			
 
				+				 data_size, rdata, rdata_size);
			
 
				+
			
 
				+	if (r != -ESRCH)
			
 
				+		return r;
			
 
				+
			
 
				+	DMERR(" Userspace log server not found.");
			
 
				+	while (1) {
			
 
				+		set_current_state(TASK_INTERRUPTIBLE);
			
 
				+		schedule_timeout(2*HZ);
			
 
				+		DMWARN("Attempting to contact userspace log server...");
			
 
				+		r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
			
 
				+					 strlen(lc->usr_argv_str) + 1,
			
 
				+					 NULL, NULL);
			
 
				+		if (!r)
			
 
				+			break;
			
 
				+	}
			
 
				+	DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
			
 
				+	r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
			
 
				+				 0, NULL, NULL);
			
 
				+	if (!r)
			
 
				+		goto retry;
			
 
				+
			
 
				+	DMERR("Error trying to resume userspace log: %d", r);
			
 
				+
			
 
				+	return -ESRCH;
			
 
				+}
			
 
				+
			
 
				+static int build_constructor_string(struct dm_target *ti,
			
 
				+				    unsigned argc, char **argv,
			
 
				+				    char **ctr_str)
			
 
				+{
			
 
				+	int i, str_size;
			
 
				+	char *str = NULL;
			
 
				+
			
 
				+	*ctr_str = NULL;
			
 
				+
			
 
				+	for (i = 0, str_size = 0; i < argc; i++)
			
 
				+		str_size += strlen(argv[i]) + 1; /* +1 for space between args */
			
 
				+
			
 
				+	str_size += 20; /* Max number of chars in a printed u64 number */
			
 
				+
			
 
				+	str = kzalloc(str_size, GFP_KERNEL);
			
 
				+	if (!str) {
			
 
				+		DMWARN("Unable to allocate memory for constructor string");
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0, str_size = 0; i < argc; i++)
			
 
				+		str_size += sprintf(str + str_size, "%s ", argv[i]);
			
 
				+	str_size += sprintf(str + str_size, "%llu",
			
 
				+			    (unsigned long long)ti->len);
			
 
				+
			
 
				+	*ctr_str = str;
			
 
				+	return str_size;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * userspace_ctr
			
 
				+ *
			
 
				+ * argv contains:
			
 
				+ *	<UUID> <other args>
			
 
				+ * Where 'other args' is the userspace implementation specific log
			
 
				+ * arguments.  An example might be:
			
 
				+ *	<UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
			
 
				+ *
			
 
				+ * So, this module will strip off the <UUID> for identification purposes
			
 
				+ * when communicating with userspace about a log; but will pass on everything
			
 
				+ * else.
			
 
				+ */
			
 
				+static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
			
 
				+			 unsigned argc, char **argv)
			
 
				+{
			
 
				+	int r = 0;
			
 
				+	int str_size;
			
 
				+	char *ctr_str = NULL;
			
 
				+	struct log_c *lc = NULL;
			
 
				+	uint64_t rdata;
			
 
				+	size_t rdata_size = sizeof(rdata);
			
 
				+
			
 
				+	if (argc < 3) {
			
 
				+		DMWARN("Too few arguments to userspace dirty log");
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
			
 
				+	if (!lc) {
			
 
				+		DMWARN("Unable to allocate userspace log context.");
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	lc->ti = ti;
			
 
				+
			
 
				+	if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
			
 
				+		DMWARN("UUID argument too long.");
			
 
				+		kfree(lc);
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	strncpy(lc->uuid, argv[0], DM_UUID_LEN);
			
 
				+	spin_lock_init(&lc->flush_lock);
			
 
				+	INIT_LIST_HEAD(&lc->flush_list);
			
 
				+
			
 
				+	str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
			
 
				+	if (str_size < 0) {
			
 
				+		kfree(lc);
			
 
				+		return str_size;
			
 
				+	}
			
 
				+
			
 
				+	/* Send table string */
			
 
				+	r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
			
 
				+				 ctr_str, str_size, NULL, NULL);
			
 
				+
			
 
				+	if (r == -ESRCH) {
			
 
				+		DMERR("Userspace log server not found");
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	/* Since the region size does not change, get it now */
			
 
				+	rdata_size = sizeof(rdata);
			
 
				+	r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
			
 
				+				 NULL, 0, (char *)&rdata, &rdata_size);
			
 
				+
			
 
				+	if (r) {
			
 
				+		DMERR("Failed to get region size of dirty log");
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	lc->region_size = (uint32_t)rdata;
			
 
				+	lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
			
 
				+
			
 
				+out:
			
 
				+	if (r) {
			
 
				+		kfree(lc);
			
 
				+		kfree(ctr_str);
			
 
				+	} else {
			
 
				+		lc->usr_argv_str = ctr_str;
			
 
				+		lc->usr_argc = argc;
			
 
				+		log->context = lc;
			
 
				+	}
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+static void userspace_dtr(struct dm_dirty_log *log)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct log_c *lc = log->context;
			
 
				+
			
 
				+	r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
			
 
				+				 NULL, 0,
			
 
				+				 NULL, NULL);
			
 
				+
			
 
				+	kfree(lc->usr_argv_str);
			
 
				+	kfree(lc);
			
 
				+
			
 
				+	return;
			
 
				+}
			
 
				+
			
 
				+static int userspace_presuspend(struct dm_dirty_log *log)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct log_c *lc = log->context;
			
 
				+
			
 
				+	r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
			
 
				+				 NULL, 0,
			
 
				+				 NULL, NULL);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+static int userspace_postsuspend(struct dm_dirty_log *log)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct log_c *lc = log->context;
			
 
				+
			
 
				+	r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
			
 
				+				 NULL, 0,
			
 
				+				 NULL, NULL);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+/*
				+ * userspace_resume
				+ *
				+ * Reset the cached in-sync hint and forward a DM_ULOG_RESUME
				+ * request for this log to the userspace server.
				+ *
				+ * Returns: result of dm_consult_userspace (0 on success)
				+ */
				+static int userspace_resume(struct dm_dirty_log *log)
				+{
				+	struct log_c *ctx = log->context;
				+
				+	/* The hint is cleared before the server is consulted. */
				+	ctx->in_sync_hint = 0;
				+
				+	return dm_consult_userspace(ctx->uuid, DM_ULOG_RESUME,
				+				    NULL, 0, NULL, NULL);
				+}
			
 
				+
			
 
				+/*
				+ * userspace_get_region_size
				+ *
				+ * Returns: the region size stored in the log context
				+ */
				+static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
				+{
				+	const struct log_c *ctx = log->context;
				+
				+	return ctx->region_size;
				+}
			
 
				+
			
 
				+/*
				+ * userspace_is_clean
				+ *
				+ * Ask the server (DM_ULOG_IS_CLEAN) whether a region is clean.
				+ * Any failure while consulting the server is reported as
				+ * "not clean".
				+ *
				+ * Returns: 1 if clean, 0 otherwise
				+ */
				+static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
				+{
				+	struct log_c *ctx = log->context;
				+	uint64_t region64 = (uint64_t)region;
				+	int64_t is_clean;
				+	size_t reply_size = sizeof(is_clean);
				+
				+	if (userspace_do_request(ctx, ctx->uuid, DM_ULOG_IS_CLEAN,
				+				 (char *)&region64, sizeof(region64),
				+				 (char *)&is_clean, &reply_size))
				+		return 0;
				+
				+	return (int)is_clean;
				+}
			
 
				+
			
 
				+/*
				+ * userspace_in_sync
				+ *
				+ * Ask the server (DM_ULOG_IN_SYNC) whether the region is in-sync.
				+ * Any failure while consulting the server is reported as
				+ * "not in sync".
				+ *
				+ * If 'can_block' is not set, return -EWOULDBLOCK immediately
				+ * without consulting the server.
				+ *
				+ * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
				+ */
				+static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
				+			     int can_block)
				+{
				+	struct log_c *ctx = log->context;
				+	uint64_t region64 = region;
				+	int64_t in_sync;
				+	size_t reply_size = sizeof(in_sync);
				+
				+	/*
				+	 * The answer can never be given locally, not even when
				+	 * in_sync_hint is set: another machine could see a device
				+	 * failure and mark the region out-of-sync.  Without asking
				+	 * userspace we might believe the region is in-sync and let
				+	 * a read pick up stale data.  (Very unlikely when a device
				+	 * really fails, but very likely when only the connection
				+	 * from one machine to one device fails.)
				+	 *
				+	 * A mirror that caches the in-sync state would not make
				+	 * this call at all, so any resulting problem is a mirror
				+	 * problem.
				+	 */
				+	if (!can_block)
				+		return -EWOULDBLOCK;
				+
				+	if (userspace_do_request(ctx, ctx->uuid, DM_ULOG_IN_SYNC,
				+				 (char *)&region64, sizeof(region64),
				+				 (char *)&in_sync, &reply_size))
				+		return 0;
				+
				+	return (int)in_sync;
				+}
			
 
				+
			
 
				+/*
				+ * userspace_flush
				+ *
				+ * This function is allowed to block.
				+ *
				+ * Flushing is done in two stages: every queued mark/clear
				+ * request is sent to the server first, then a single
				+ * DM_ULOG_FLUSH tells the server to commit them.  That lets the
				+ * server optimise the commit instead of committing per request.
				+ *
				+ * A possible extension is a separate thread that pushes the
				+ * requests up to the server ahead of time, leaving flush with a
				+ * shorter list and the final commit.
				+ *
				+ * Returns: 0 on success, < 0 on failure
				+ */
				+static int userspace_flush(struct dm_dirty_log *log)
				+{
				+	struct log_c *ctx = log->context;
				+	struct flush_entry *entry, *next;
				+	unsigned long irq_flags;
				+	LIST_HEAD(pending);
				+	int r = 0;
				+
				+	/* Detach everything queued so far; new entries may keep arriving. */
				+	spin_lock_irqsave(&ctx->flush_lock, irq_flags);
				+	list_splice_init(&ctx->flush_list, &pending);
				+	spin_unlock_irqrestore(&ctx->flush_lock, irq_flags);
				+
				+	if (list_empty(&pending))
				+		return 0;
				+
				+	/*
				+	 * FIXME: Count up requests, group request types,
				+	 * allocate memory to stick all requests in and
				+	 * send to server in one go.  Failing the allocation,
				+	 * do it one by one.
				+	 */
				+	list_for_each_entry(entry, &pending, list) {
				+		r = userspace_do_request(ctx, ctx->uuid, entry->type,
				+					 (char *)&entry->region,
				+					 sizeof(entry->region),
				+					 NULL, NULL);
				+		if (r)
				+			break;
				+	}
				+
				+	/* Only commit when every queued request went through. */
				+	if (!r)
				+		r = userspace_do_request(ctx, ctx->uuid, DM_ULOG_FLUSH,
				+					 NULL, 0, NULL, NULL);
				+
				+	/*
				+	 * The entries can be dropped even after a failure: the
				+	 * caller receives the error and knows the log facility
				+	 * has failed.
				+	 */
				+	list_for_each_entry_safe(entry, next, &pending, list) {
				+		list_del(&entry->list);
				+		mempool_free(entry, flush_entry_pool);
				+	}
				+
				+	if (r)
				+		dm_table_event(ctx->ti->table);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * userspace_mark_region
				+ *
				+ * Queue a DM_ULOG_MARK_REGION request on the log's flush list.
				+ * This function should avoid blocking unless absolutely required.
				+ * (Blocking for the memory allocation is acceptable.)
				+ */
				+static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
				+{
				+	struct log_c *ctx = log->context;
				+	struct flush_entry *entry;
				+	unsigned long irq_flags;
				+
				+	/* Wait for an allocation, but _never_ fail */
				+	entry = mempool_alloc(flush_entry_pool, GFP_NOIO);
				+	BUG_ON(!entry);
				+
				+	/* The entry is private until it is linked, so fill it first. */
				+	entry->type = DM_ULOG_MARK_REGION;
				+	entry->region = region;
				+
				+	spin_lock_irqsave(&ctx->flush_lock, irq_flags);
				+	list_add(&entry->list, &ctx->flush_list);
				+	spin_unlock_irqrestore(&ctx->flush_lock, irq_flags);
				+}
			
 
				+
			
 
				+/*
				+ * userspace_clear_region
				+ *
				+ * Queue a DM_ULOG_CLEAR_REGION request on the log's flush list.
				+ *
				+ * This function must not block, so the allocation must not block
				+ * either.  In the worst case the allocation fails and the region
				+ * is simply not cleared.  That does not affect the current sync
				+ * context, but the region will be re-sync'ed on a reload of the
				+ * mirror even though it is in-sync.
				+ */
				+static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
				+{
				+	struct log_c *ctx = log->context;
				+	struct flush_entry *entry;
				+	unsigned long irq_flags;
				+
				+	/*
				+	 * A failed allocation only means the clear is skipped; the
				+	 * sole cost is a resync of the region the next time the
				+	 * device is activated.
				+	 */
				+	entry = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
				+	if (!entry) {
				+		DMERR("Failed to allocate memory to clear region.");
				+		return;
				+	}
				+
				+	/* The entry is private until it is linked, so fill it first. */
				+	entry->type = DM_ULOG_CLEAR_REGION;
				+	entry->region = region;
				+
				+	spin_lock_irqsave(&ctx->flush_lock, irq_flags);
				+	list_add(&entry->list, &ctx->flush_list);
				+	spin_unlock_irqrestore(&ctx->flush_lock, irq_flags);
				+}
			
 
				+
			
 
				+/*
				+ * userspace_get_resync_work
				+ *
				+ * Get a region that needs recovery.  It is valid to return
				+ * an error for this function.
				+ *
				+ * '*region' is only written when the request to the server
				+ * succeeded; on error it is left untouched instead of being
				+ * filled from an uninitialised reply buffer.
				+ *
				+ * Returns: 1 if region filled, 0 if no work, <0 on error
				+ */
				+static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
				+{
				+	int r;
				+	size_t rdata_size;
				+	struct log_c *lc = log->context;
				+	struct {
				+		int64_t i; /* 64-bit for mix arch compatibility */
				+		region_t r;
				+	} pkg;
				+
				+	/* Everything is already reported in-sync: no work, no round trip. */
				+	if (lc->in_sync_hint >= lc->region_count)
				+		return 0;
				+
				+	rdata_size = sizeof(pkg);
				+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
				+				 NULL, 0,
				+				 (char *)&pkg, &rdata_size);
				+	if (r)
				+		return r;
				+
				+	*region = pkg.r;
				+	return (int)pkg.i;
				+}
			
 
				+
			
 
				+/*
				+ * userspace_set_region_sync
				+ *
				+ * Set the sync status of a given region by sending a
				+ * DM_ULOG_SET_REGION_SYNC request carrying the region and the
				+ * new state.  This function must not fail, so the result of the
				+ * request is intentionally discarded.
				+ */
				+static void userspace_set_region_sync(struct dm_dirty_log *log,
				+				      region_t region, int in_sync)
				+{
				+	struct log_c *lc = log->context;
				+	struct {
				+		region_t r;
				+		int64_t i;
				+	} pkg;
				+
				+	pkg.r = region;
				+	pkg.i = (int64_t)in_sync;
				+
				+	/*
				+	 * It would be nice to be able to report failures.
				+	 * However, it is easy enough to detect and resolve.
				+	 */
				+	(void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
				+				    (char *)&pkg, sizeof(pkg),
				+				    NULL, NULL);
				+}
			
 
				+
			
 
				+/*
				+ * userspace_get_sync_count
				+ *
				+ * Ask the server (DM_ULOG_GET_SYNC_COUNT) how many regions are
				+ * in sync.  Any failure while consulting the server is reported
				+ * as a sync count of zero.
				+ *
				+ * Returns: sync count on success, 0 on failure
				+ */
				+static region_t userspace_get_sync_count(struct dm_dirty_log *log)
				+{
				+	struct log_c *ctx = log->context;
				+	uint64_t sync_count;
				+	size_t reply_size = sizeof(sync_count);
				+
				+	if (userspace_do_request(ctx, ctx->uuid, DM_ULOG_GET_SYNC_COUNT,
				+				 NULL, 0,
				+				 (char *)&sync_count, &reply_size))
				+		return 0;
				+
				+	/* Every region is in sync: remember that in the hint. */
				+	if (sync_count >= ctx->region_count)
				+		ctx->in_sync_hint = ctx->region_count;
				+
				+	return (region_t)sync_count;
				+}
			
 
				+
			
 
				+/*
				+ * userspace_status
				+ *
				+ * Produce the status string for this log in 'result' (at most
				+ * 'maxlen' bytes).  For STATUSTYPE_INFO the text comes from the
				+ * userspace server; for STATUSTYPE_TABLE it is built locally from
				+ * the log type name, the uuid and the saved constructor string.
				+ *
				+ * NOTE(review): when the INFO request fails, "COM_FAILURE" is
				+ * emitted but 0 is returned - confirm callers expect that.
				+ *
				+ * Returns: amount of space consumed
				+ */
				+static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
				+			    char *result, unsigned maxlen)
				+{
				+	int r = 0;
				+	/* Value-result: space offered to the server, space used on return. */
				+	size_t sz = (size_t)maxlen;
				+	struct log_c *lc = log->context;
				+
				+	switch (status_type) {
				+	case STATUSTYPE_INFO:
				+		r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
				+					 NULL, 0,
				+					 result, &sz);
				+
				+		if (r) {
				+			/*
				+			 * DMEMIT presumably writes at offset 'sz' in
				+			 * 'result' (TODO confirm against its definition),
				+			 * hence the reset before emitting.
				+			 */
				+			sz = 0;
				+			DMEMIT("%s 1 COM_FAILURE", log->type->name);
				+		}
				+		break;
				+	case STATUSTYPE_TABLE:
				+		sz = 0;
				+		/* Argument count is the saved argc plus one for the uuid. */
				+		DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
				+		       lc->uuid, lc->usr_argv_str);
				+		break;
				+	}
				+	return (r) ? 0 : (int)sz;
				+}
			
 
				+
			
 
				+/*
				+ * userspace_is_remote_recovering
				+ *
				+ * Ask the server (DM_ULOG_IS_REMOTE_RECOVERING) whether another
				+ * machine is recovering the region.  Queries are rate limited to
				+ * one per HZ/4 jiffies; while rate limited, or when the request
				+ * fails, the region is reported as recovering.  The rate limit
				+ * is a function-static shared by every log using this type.
				+ *
				+ * Returns: 1 if region recovering, 0 otherwise
				+ */
				+static int userspace_is_remote_recovering(struct dm_dirty_log *log,
				+					  region_t region)
				+{
				+	int r;
				+	uint64_t region64 = region;
				+	struct log_c *lc = log->context;
				+	/* unsigned long to match jiffies, as time_after() requires */
				+	static unsigned long limit;
				+	static int limit_valid;
				+	struct {
				+		int64_t is_recovering;
				+		uint64_t in_sync_hint;
				+	} pkg;
				+	size_t rdata_size = sizeof(pkg);
				+
				+	/*
				+	 * Once the mirror has been reported to be in-sync,
				+	 * it will never again ask for recovery work.  So,
				+	 * we can safely say there is not a remote machine
				+	 * recovering if the device is in-sync.  (in_sync_hint
				+	 * must be reset at resume time.)
				+	 */
				+	if (region < lc->in_sync_hint)
				+		return 0;
				+	/*
				+	 * Wrap-safe comparison: a plain 'jiffies < limit' stays true
				+	 * for a whole jiffies period when the counter wraps after
				+	 * 'limit' was set.  'limit_valid' keeps the very first call
				+	 * from being throttled by the zero-initialised limit.
				+	 */
				+	else if (limit_valid && time_after(limit, jiffies))
				+		return 1;
				+
				+	limit = jiffies + (HZ / 4);
				+	limit_valid = 1;
				+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
				+				 (char *)&region64, sizeof(region64),
				+				 (char *)&pkg, &rdata_size);
				+	if (r)
				+		return 1;
				+
				+	/* The server piggy-backs an updated in-sync hint on the reply. */
				+	lc->in_sync_hint = pkg.in_sync_hint;
				+
				+	return (int)pkg.is_recovering;
				+}
			
 
				+
			
 
				+/*
				+ * Operations table for the "userspace" dirty log type.  It is
				+ * registered with dm_dirty_log_type_register() at module init and
				+ * unregistered at module exit; each callback forwards the
				+ * operation to the userspace log server.
				+ */
				+static struct dm_dirty_log_type _userspace_type = {
				+	.name = "userspace",
				+	.module = THIS_MODULE,
				+	.ctr = userspace_ctr,
				+	.dtr = userspace_dtr,
				+	.presuspend = userspace_presuspend,
				+	.postsuspend = userspace_postsuspend,
				+	.resume = userspace_resume,
				+	.get_region_size = userspace_get_region_size,
				+	.is_clean = userspace_is_clean,
				+	.in_sync = userspace_in_sync,
				+	.flush = userspace_flush,
				+	.mark_region = userspace_mark_region,
				+	.clear_region = userspace_clear_region,
				+	.get_resync_work = userspace_get_resync_work,
				+	.set_region_sync = userspace_set_region_sync,
				+	.get_sync_count = userspace_get_sync_count,
				+	.status = userspace_status,
				+	.is_remote_recovering = userspace_is_remote_recovering,
				+};
			
 
				+
			
 
				+/*
				+ * userspace_dirty_log_init
				+ *
				+ * Module init: create the flush entry mempool, bring up the
				+ * transfer layer and register the "userspace" log type.  Each
				+ * failure undoes the steps that already succeeded.
				+ *
				+ * Returns: 0 on success, -ENOMEM or the failing step's error
				+ */
				+static int __init userspace_dirty_log_init(void)
				+{
				+	int r;
				+
				+	flush_entry_pool = mempool_create(100, flush_entry_alloc,
				+					  flush_entry_free, NULL);
				+	if (!flush_entry_pool) {
				+		DMWARN("Unable to create flush_entry_pool:  No memory.");
				+		return -ENOMEM;
				+	}
				+
				+	r = dm_ulog_tfr_init();
				+	if (r) {
				+		DMWARN("Unable to initialize userspace log communications");
				+		goto destroy_pool;
				+	}
				+
				+	r = dm_dirty_log_type_register(&_userspace_type);
				+	if (r) {
				+		DMWARN("Couldn't register userspace dirty log type");
				+		goto exit_tfr;
				+	}
				+
				+	DMINFO("version 1.0.0 loaded");
				+	return 0;
				+
				+exit_tfr:
				+	dm_ulog_tfr_exit();
				+destroy_pool:
				+	mempool_destroy(flush_entry_pool);
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * userspace_dirty_log_exit
				+ *
				+ * Module exit: undo userspace_dirty_log_init in reverse order -
				+ * unregister the log type, shut down the transfer layer, then
				+ * destroy the flush entry mempool.
				+ */
				+static void __exit userspace_dirty_log_exit(void)
				+{
				+	dm_dirty_log_type_unregister(&_userspace_type);
				+	dm_ulog_tfr_exit();
				+	mempool_destroy(flush_entry_pool);
				+
				+	DMINFO("version 1.0.0 unloaded");
				+}
			
 
				+
			
 
				+/* Module entry/exit points and module metadata. */
				+module_init(userspace_dirty_log_init);
				+module_exit(userspace_dirty_log_exit);
				+
				+MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
				+MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
				+MODULE_LICENSE("GPL");
			
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -0,0 +1,276 @@
 
				+/*
			
 
				+ * Copyright (C) 2006-2009 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the LGPL.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <net/sock.h>
			
 
				+#include <linux/workqueue.h>
			
 
				+#include <linux/connector.h>
			
 
				+#include <linux/device-mapper.h>
			
 
				+#include <linux/dm-log-userspace.h>
			
 
				+
			
 
				+#include "dm-log-userspace-transfer.h"
			
 
				+
			
 
				+/* Sequence number stamped on each request sent by dm_consult_userspace. */
				+static uint32_t dm_ulog_seq;
				+
				+/*
				+ * Netlink/Connector is an unreliable protocol.  How long should
				+ * we wait for a response before assuming it was lost and retrying?
				+ * (If we do receive a response after this time, it will be discarded
				+ * and the response to the resent request will be waited for.)
				+ */
				+#define DM_ULOG_RETRY_TIMEOUT (15 * HZ)
				+
				+/*
				+ * Pre-allocated space for speed.  One buffer of
				+ * DM_ULOG_PREALLOCED_SIZE bytes holds the connector header
				+ * (prealloced_cn_msg) immediately followed by the request
				+ * (prealloced_ulog_tfr); see dm_ulog_tfr_init.
				+ */
				+#define DM_ULOG_PREALLOCED_SIZE 512
				+static struct cn_msg *prealloced_cn_msg;
				+static struct dm_ulog_request *prealloced_ulog_tfr;
				+
				+/* Connector id used both for sending and for the receive callback. */
				+static struct cb_id ulog_cn_id = {
				+	.idx = CN_IDX_DM,
				+	.val = CN_VAL_DM_USERSPACE_LOG
				+};
				+
				+/* Serializes senders so they can share the preallocated buffer. */
				+static DEFINE_MUTEX(dm_ulog_lock);
			
 
				+
			
 
				+/*
				+ * One outstanding request waiting for its reply.  Instances live
				+ * on the stack of dm_consult_userspace and are linked on
				+ * receiving_list while the request is in flight.
				+ */
				+struct receiving_pkg {
				+	struct list_head list;		/* link on receiving_list */
				+	struct completion complete;	/* completed by fill_pkg on reply */
				+
				+	uint32_t seq;			/* matched against the reply's seq */
				+
				+	int error;			/* result code filled in by fill_pkg */
				+	size_t *data_size;		/* in: space in 'data', out: bytes used */
				+	char *data;			/* caller's buffer for the reply payload */
				+};
				+
				+/* receiving_list_lock is held around every walk or change of receiving_list. */
				+static DEFINE_SPINLOCK(receiving_list_lock);
				+static struct list_head receiving_list;
			
 
				+
			
 
				+/*
				+ * Fill in the preallocated connector header for 'tfr' and hand
				+ * the message to the connector.  The request itself is not
				+ * copied here.
				+ * NOTE(review): this presumably relies on 'tfr' being
				+ * prealloced_ulog_tfr, i.e. already placed right behind the
				+ * header - confirm against callers.
				+ *
				+ * Returns: result of cn_netlink_send
				+ */
				+static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
				+{
				+	struct cn_msg *hdr = prealloced_cn_msg;
				+
				+	memset(hdr, 0, sizeof(*hdr));
				+
				+	hdr->id.idx = ulog_cn_id.idx;
				+	hdr->id.val = ulog_cn_id.val;
				+	hdr->ack = 0;
				+	hdr->seq = tfr->seq;
				+	hdr->len = sizeof(*tfr) + tfr->data_size;
				+
				+	return cn_netlink_send(hdr, 0, gfp_any());
				+}
			
 
				+
			
 
				+/*
				+ * Parameters for this function can be either msg or tfr, but not
				+ * both.  This function fills in the reply for a waiting request.
				+ * If just msg is given, then the reply is simply an ACK from userspace
				+ * that the request was received.
				+ *
				+ * The waiting request is found by sequence number on
				+ * receiving_list.  The only visible caller, cn_ulog_callback,
				+ * holds receiving_list_lock around the call.
				+ *
				+ * Returns: 0 on success, -ENOENT on failure
				+ */
				+static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
				+{
				+	uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0;
				+	struct receiving_pkg *pkg;
				+
				+	/*
				+	 * The 'receiving_pkg' entries in this list are statically
				+	 * allocated on the stack in 'dm_consult_userspace'.
				+	 * Each process that is waiting for a reply from the user
				+	 * space server will have an entry in this list.
				+	 *
				+	 * We are safe to do it this way because the stack space
				+	 * is unique to each process, but still addressable by
				+	 * other processes.
				+	 */
				+	list_for_each_entry(pkg, &receiving_list, list) {
				+		if (rtn_seq != pkg->seq)
				+			continue;
				+
				+		if (msg) {
				+			pkg->error = -msg->ack;
				+			/*
				+			 * If we are trying again, we will need to know our
				+			 * storage capacity.  Otherwise, along with the
				+			 * error code, we make explicit that we have no data.
				+			 */
				+			if (pkg->error != -EAGAIN)
				+				*(pkg->data_size) = 0;
				+		} else if (tfr->data_size > *(pkg->data_size)) {
				+			/* %zu: *(pkg->data_size) is a size_t, not an unsigned long */
				+			DMERR("Insufficient space to receive package [%u] "
				+			      "(%u vs %zu)", tfr->request_type,
				+			      tfr->data_size, *(pkg->data_size));
				+
				+			*(pkg->data_size) = 0;
				+			pkg->error = -ENOSPC;
				+		} else {
				+			pkg->error = tfr->error;
				+			memcpy(pkg->data, tfr->data, tfr->data_size);
				+			*(pkg->data_size) = tfr->data_size;
				+		}
				+		/* Wake the sender sleeping in dm_consult_userspace. */
				+		complete(&pkg->complete);
				+		return 0;
				+	}
				+
				+	return -ENOENT;
				+}
			
 
				+
			
 
				+/*
				+ * Connector callback: delivers a message that was sent from
				+ * userspace.  A zero-length message is treated as a bare ACK, a
				+ * message at least as large as a request header as a full reply;
				+ * anything in between is logged and dropped.
				+ */
				+static void cn_ulog_callback(void *data)
				+{
				+	struct cn_msg *msg = data;
				+	/* The reply, when present, sits right behind the connector header. */
				+	struct dm_ulog_request *reply = (struct dm_ulog_request *)(msg + 1);
				+
				+	spin_lock(&receiving_list_lock);
				+	if (!msg->len)
				+		fill_pkg(msg, NULL);
				+	else if (msg->len >= sizeof(*reply))
				+		fill_pkg(NULL, reply);
				+	else
				+		DMERR("Incomplete message received (expected %u, got %u): [%u]",
				+		      (unsigned)sizeof(*reply), msg->len, msg->seq);
				+	spin_unlock(&receiving_list_lock);
				+}
			
 
				+
			
 
				+/**
				+ * dm_consult_userspace
				+ * @uuid: log's uuid (must be DM_UUID_LEN in size)
				+ * @request_type:  found in include/linux/dm-log-userspace.h
				+ * @data: data to tx to the server
				+ * @data_size: size of data in bytes
				+ * @rdata: place to put return data from server
				+ * @rdata_size: value-result (amount of space given/amount of space used)
				+ *
				+ * rdata_size is undefined on failure.
				+ *
				+ * Memory used to communicate with userspace is zero'ed
				+ * before populating to ensure that no unwanted bits leak
				+ * from kernel space to user-space.  All userspace log communications
				+ * between kernel and user space go through this function.
				+ *
				+ * The request is resent when no reply arrives within
				+ * DM_ULOG_RETRY_TIMEOUT or when the server answers -EAGAIN.
				+ *
				+ * Returns: 0 on success, -EXXX on failure
				+ **/
				+int dm_consult_userspace(const char *uuid, int request_type,
				+			 char *data, size_t data_size,
				+			 char *rdata, size_t *rdata_size)
				+{
				+	int r = 0;
				+	size_t dummy = 0;
				+	/*
				+	 * Bytes of the preallocated buffer taken by the two headers.
				+	 * This must be the size of the request structure itself, not
				+	 * of a pointer to it, or the payload check below lets
				+	 * 'data' run past the end of the buffer.
				+	 */
				+	int overhead_size =
				+		sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
				+	struct dm_ulog_request *tfr = prealloced_ulog_tfr;
				+	struct receiving_pkg pkg;
				+
				+	if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
				+		DMINFO("Size of tfr exceeds preallocated size");
				+		return -EINVAL;
				+	}
				+
				+	/* Callers not interested in reply data may pass NULL. */
				+	if (!rdata_size)
				+		rdata_size = &dummy;
				+resend:
				+	/*
				+	 * We serialize the sending of requests so we can
				+	 * use the preallocated space.
				+	 */
				+	mutex_lock(&dm_ulog_lock);
				+
				+	/* Zero everything behind the connector header: request + payload. */
				+	memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
				+	memcpy(tfr->uuid, uuid, DM_UUID_LEN);
				+	tfr->seq = dm_ulog_seq++;
				+
				+	/*
				+	 * Must be valid request type (all other bits set to
				+	 * zero).  This reserves other bits for possible future
				+	 * use.
				+	 */
				+	tfr->request_type = request_type & DM_ULOG_REQUEST_MASK;
				+
				+	tfr->data_size = data_size;
				+	if (data && data_size)
				+		memcpy(tfr->data, data, data_size);
				+
				+	/* Queue ourselves before sending so the reply cannot be missed. */
				+	memset(&pkg, 0, sizeof(pkg));
				+	init_completion(&pkg.complete);
				+	pkg.seq = tfr->seq;
				+	pkg.data_size = rdata_size;
				+	pkg.data = rdata;
				+	spin_lock(&receiving_list_lock);
				+	list_add(&(pkg.list), &receiving_list);
				+	spin_unlock(&receiving_list_lock);
				+
				+	r = dm_ulog_sendto_server(tfr);
				+
				+	mutex_unlock(&dm_ulog_lock);
				+
				+	if (r) {
				+		DMERR("Unable to send log request [%u] to userspace: %d",
				+		      request_type, r);
				+		spin_lock(&receiving_list_lock);
				+		list_del_init(&(pkg.list));
				+		spin_unlock(&receiving_list_lock);
				+
				+		goto out;
				+	}
				+
				+	r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
				+	/* pkg lives on this stack frame: unlink it before it can go away. */
				+	spin_lock(&receiving_list_lock);
				+	list_del_init(&(pkg.list));
				+	spin_unlock(&receiving_list_lock);
				+	if (!r) {
				+		DMWARN("[%s] Request timed out: [%u/%u] - retrying",
				+		       (strlen(uuid) > 8) ?
				+		       (uuid + (strlen(uuid) - 8)) : (uuid),
				+		       request_type, pkg.seq);
				+		goto resend;
				+	}
				+
				+	r = pkg.error;
				+	if (r == -EAGAIN)
				+		goto resend;
				+
				+out:
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * dm_ulog_tfr_init
				+ *
				+ * Set up the transfer layer: initialise the list of waiting
				+ * requests, allocate the shared send buffer (connector header
				+ * followed by the request) and register the connector callback.
				+ * If the registration fails the buffer is freed again, so
				+ * nothing is left behind for the caller to clean up.
				+ *
				+ * Returns: 0 on success, -ENOMEM or the cn_add_callback error
				+ */
				+int dm_ulog_tfr_init(void)
				+{
				+	int r;
				+	void *prealloced;
				+
				+	INIT_LIST_HEAD(&receiving_list);
				+
				+	prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL);
				+	if (!prealloced)
				+		return -ENOMEM;
				+
				+	prealloced_cn_msg = prealloced;
				+	prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg);
				+
				+	r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
				+	if (r) {
				+		/*
				+		 * Nothing was registered, so there is no callback to
				+		 * remove; what has to be undone is the allocation.
				+		 */
				+		kfree(prealloced_cn_msg);
				+		prealloced_cn_msg = NULL;
				+		prealloced_ulog_tfr = NULL;
				+		return r;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * dm_ulog_tfr_exit
				+ *
				+ * Tear down the transfer layer: remove the connector callback and
				+ * free the preallocated buffer.  prealloced_cn_msg is the start of
				+ * that buffer, so freeing it releases the request area as well.
				+ */
				+void dm_ulog_tfr_exit(void)
				+{
				+	cn_del_callback(&ulog_cn_id);
				+	kfree(prealloced_cn_msg);
				+}
			
--- a/drivers/md/dm-log-userspace-transfer.h
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -0,0 +1,18 @@
 
				+/*
			
 
				+ * Copyright (C) 2006-2009 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the LGPL.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DM_LOG_USERSPACE_TRANSFER_H__
				+#define __DM_LOG_USERSPACE_TRANSFER_H__
				+
				+/* Prefix used by the DMERR/DMWARN/DMINFO messages of this module. */
				+#define DM_MSG_PREFIX "dm-log-userspace"
				+
				+/* Set up / tear down the kernel <-> userspace transfer layer. */
				+int dm_ulog_tfr_init(void);
				+void dm_ulog_tfr_exit(void);
				+
				+/*
				+ * Send one request to the userspace log server and wait for the
				+ * reply.  rdata_size is value-result: space given / space used.
				+ * Returns 0 on success, -EXXX on failure.
				+ */
				+int dm_consult_userspace(const char *uuid, int request_type,
				+			 char *data, size_t data_size,
				+			 char *rdata, size_t *rdata_size);
				+
				+#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */
			
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -412,11 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 
				 		/*
			
 
				 		 * Buffer holds both header and bitset.
			
 
				 		 */
			
 
				-		buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
			
 
				-				       bitset_size,
			
 
				-				       ti->limits.logical_block_size);
			
 
				+		buf_size =
			
 
				+		    dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
			
 
				+				bdev_logical_block_size(lc->header_location.
			
 
				+							    bdev));
			
 
				 
			
 
				-		if (buf_size > dev->bdev->bd_inode->i_size) {
			
 
				+		if (buf_size > i_size_read(dev->bdev->bd_inode)) {
			
 
				 			DMWARN("log device %s too small: need %llu bytes",
			
 
				 				dev->name, (unsigned long long)buf_size);
			
 
				 			kfree(lc);
			
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,7 +8,6 @@
 
				 #include <linux/device-mapper.h>
			
 
				 
			
 
				 #include "dm-path-selector.h"
			
 
				-#include "dm-bio-record.h"
			
 
				 #include "dm-uevent.h"
			
 
				 
			
 
				 #include <linux/ctype.h>
			
@@ -35,6 +34,7 @@ struct pgpath {
 
				 
			
 
				 	struct dm_path path;
			
 
				 	struct work_struct deactivate_path;
			
 
				+	struct work_struct activate_path;
			
 
				 };
			
 
				 
			
 
				 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
			
@@ -64,8 +64,6 @@ struct multipath {
 
				 	spinlock_t lock;
			
 
				 
			
 
				 	const char *hw_handler_name;
			
 
				-	struct work_struct activate_path;
			
 
				-	struct pgpath *pgpath_to_activate;
			
 
				 	unsigned nr_priority_groups;
			
 
				 	struct list_head priority_groups;
			
 
				 	unsigned pg_init_required;	/* pg_init needs calling? */
			
@@ -84,7 +82,7 @@ struct multipath {
 
				 	unsigned pg_init_count;		/* Number of times pg_init called */
			
 
				 
			
 
				 	struct work_struct process_queued_ios;
			
 
				-	struct bio_list queued_ios;
			
 
				+	struct list_head queued_ios;
			
 
				 	unsigned queue_size;
			
 
				 
			
 
				 	struct work_struct trigger_event;
			
@@ -101,7 +99,7 @@ struct multipath {
 
				  */
			
 
				 struct dm_mpath_io {
			
 
				 	struct pgpath *pgpath;
			
 
				-	struct dm_bio_details details;
			
 
				+	size_t nr_bytes;
			
 
				 };
			
 
				 
			
 
				 typedef int (*action_fn) (struct pgpath *pgpath);
			
@@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void)
 
				 	if (pgpath) {
			
 
				 		pgpath->is_active = 1;
			
 
				 		INIT_WORK(&pgpath->deactivate_path, deactivate_path);
			
 
				+		INIT_WORK(&pgpath->activate_path, activate_path);
			
 
				 	}
			
 
				 
			
 
				 	return pgpath;
			
@@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void)
 
				 
			
 
				 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
			
 
				 {
			
 
				-	unsigned long flags;
			
 
				 	struct pgpath *pgpath, *tmp;
			
 
				 	struct multipath *m = ti->private;
			
 
				 
			
@@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 
				 		if (m->hw_handler_name)
			
 
				 			scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
			
 
				 		dm_put_device(ti, pgpath->path.dev);
			
 
				-		spin_lock_irqsave(&m->lock, flags);
			
 
				-		if (m->pgpath_to_activate == pgpath)
			
 
				-			m->pgpath_to_activate = NULL;
			
 
				-		spin_unlock_irqrestore(&m->lock, flags);
			
 
				 		free_pgpath(pgpath);
			
 
				 	}
			
 
				 }
			
@@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 
				 	m = kzalloc(sizeof(*m), GFP_KERNEL);
			
 
				 	if (m) {
			
 
				 		INIT_LIST_HEAD(&m->priority_groups);
			
 
				+		INIT_LIST_HEAD(&m->queued_ios);
			
 
				 		spin_lock_init(&m->lock);
			
 
				 		m->queue_io = 1;
			
 
				 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
			
 
				 		INIT_WORK(&m->trigger_event, trigger_event);
			
 
				-		INIT_WORK(&m->activate_path, activate_path);
			
 
				 		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
			
 
				 		if (!m->mpio_pool) {
			
 
				 			kfree(m);
			
@@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
 
				 	m->pg_init_count = 0;
			
 
				 }
			
 
				 
			
 
				-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
			
 
				+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
			
 
				+			       size_t nr_bytes)
			
 
				 {
			
 
				 	struct dm_path *path;
			
 
				 
			
 
				-	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
			
 
				+	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
			
 
				 	if (!path)
			
 
				 		return -ENXIO;
			
 
				 
			
@@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static void __choose_pgpath(struct multipath *m)
			
 
				+static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
			
 
				 {
			
 
				 	struct priority_group *pg;
			
 
				 	unsigned bypassed = 1;
			
@@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m)
 
				 	if (m->next_pg) {
			
 
				 		pg = m->next_pg;
			
 
				 		m->next_pg = NULL;
			
 
				-		if (!__choose_path_in_pg(m, pg))
			
 
				+		if (!__choose_path_in_pg(m, pg, nr_bytes))
			
 
				 			return;
			
 
				 	}
			
 
				 
			
 
				 	/* Don't change PG until it has no remaining paths */
			
 
				-	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
			
 
				+	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
			
 
				 		return;
			
 
				 
			
 
				 	/*
			
@@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m)
 
				 		list_for_each_entry(pg, &m->priority_groups, list) {
			
 
				 			if (pg->bypassed == bypassed)
			
 
				 				continue;
			
 
				-			if (!__choose_path_in_pg(m, pg))
			
 
				+			if (!__choose_path_in_pg(m, pg, nr_bytes))
			
 
				 				return;
			
 
				 		}
			
 
				 	} while (bypassed--);
			
@@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m)
 
				 		dm_noflush_suspending(m->ti));
			
 
				 }
			
 
				 
			
 
				-static int map_io(struct multipath *m, struct bio *bio,
			
 
				+static int map_io(struct multipath *m, struct request *clone,
			
 
				 		  struct dm_mpath_io *mpio, unsigned was_queued)
			
 
				 {
			
 
				 	int r = DM_MAPIO_REMAPPED;
			
 
				+	size_t nr_bytes = blk_rq_bytes(clone);
			
 
				 	unsigned long flags;
			
 
				 	struct pgpath *pgpath;
			
 
				+	struct block_device *bdev;
			
 
				 
			
 
				 	spin_lock_irqsave(&m->lock, flags);
			
 
				 
			
 
				 	/* Do we need to select a new pgpath? */
			
 
				 	if (!m->current_pgpath ||
			
 
				 	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
			
 
				-		__choose_pgpath(m);
			
 
				+		__choose_pgpath(m, nr_bytes);
			
 
				 
			
 
				 	pgpath = m->current_pgpath;
			
 
				 
			
@@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio,
 
				 	if ((pgpath && m->queue_io) ||
			
 
				 	    (!pgpath && m->queue_if_no_path)) {
			
 
				 		/* Queue for the daemon to resubmit */
			
 
				-		bio_list_add(&m->queued_ios, bio);
			
 
				+		list_add_tail(&clone->queuelist, &m->queued_ios);
			
 
				 		m->queue_size++;
			
 
				 		if ((m->pg_init_required && !m->pg_init_in_progress) ||
			
 
				 		    !m->queue_io)
			
 
				 			queue_work(kmultipathd, &m->process_queued_ios);
			
 
				 		pgpath = NULL;
			
 
				 		r = DM_MAPIO_SUBMITTED;
			
 
				-	} else if (pgpath)
			
 
				-		bio->bi_bdev = pgpath->path.dev->bdev;
			
 
				-	else if (__must_push_back(m))
			
 
				+	} else if (pgpath) {
			
 
				+		bdev = pgpath->path.dev->bdev;
			
 
				+		clone->q = bdev_get_queue(bdev);
			
 
				+		clone->rq_disk = bdev->bd_disk;
			
 
				+	} else if (__must_push_back(m))
			
 
				 		r = DM_MAPIO_REQUEUE;
			
 
				 	else
			
 
				 		r = -EIO;	/* Failed */
			
 
				 
			
 
				 	mpio->pgpath = pgpath;
			
 
				+	mpio->nr_bytes = nr_bytes;
			
 
				+
			
 
				+	if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
			
 
				+		pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
			
 
				+					      nr_bytes);
			
 
				 
			
 
				 	spin_unlock_irqrestore(&m->lock, flags);
			
 
				 
			
@@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m)
 
				 {
			
 
				 	int r;
			
 
				 	unsigned long flags;
			
 
				-	struct bio *bio = NULL, *next;
			
 
				 	struct dm_mpath_io *mpio;
			
 
				 	union map_info *info;
			
 
				+	struct request *clone, *n;
			
 
				+	LIST_HEAD(cl);
			
 
				 
			
 
				 	spin_lock_irqsave(&m->lock, flags);
			
 
				-	bio = bio_list_get(&m->queued_ios);
			
 
				+	list_splice_init(&m->queued_ios, &cl);
			
 
				 	spin_unlock_irqrestore(&m->lock, flags);
			
 
				 
			
 
				-	while (bio) {
			
 
				-		next = bio->bi_next;
			
 
				-		bio->bi_next = NULL;
			
 
				+	list_for_each_entry_safe(clone, n, &cl, queuelist) {
			
 
				+		list_del_init(&clone->queuelist);
			
 
				 
			
 
				-		info = dm_get_mapinfo(bio);
			
 
				+		info = dm_get_rq_mapinfo(clone);
			
 
				 		mpio = info->ptr;
			
 
				 
			
 
				-		r = map_io(m, bio, mpio, 1);
			
 
				-		if (r < 0)
			
 
				-			bio_endio(bio, r);
			
 
				-		else if (r == DM_MAPIO_REMAPPED)
			
 
				-			generic_make_request(bio);
			
 
				-		else if (r == DM_MAPIO_REQUEUE)
			
 
				-			bio_endio(bio, -EIO);
			
 
				-
			
 
				-		bio = next;
			
 
				+		r = map_io(m, clone, mpio, 1);
			
 
				+		if (r < 0) {
			
 
				+			mempool_free(mpio, m->mpio_pool);
			
 
				+			dm_kill_unmapped_request(clone, r);
			
 
				+		} else if (r == DM_MAPIO_REMAPPED)
			
 
				+			dm_dispatch_request(clone);
			
 
				+		else if (r == DM_MAPIO_REQUEUE) {
			
 
				+			mempool_free(mpio, m->mpio_pool);
			
 
				+			dm_requeue_unmapped_request(clone);
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work)
 
				 {
			
 
				 	struct multipath *m =
			
 
				 		container_of(work, struct multipath, process_queued_ios);
			
 
				-	struct pgpath *pgpath = NULL;
			
 
				-	unsigned init_required = 0, must_queue = 1;
			
 
				+	struct pgpath *pgpath = NULL, *tmp;
			
 
				+	unsigned must_queue = 1;
			
 
				 	unsigned long flags;
			
 
				 
			
 
				 	spin_lock_irqsave(&m->lock, flags);
			
@@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work)
 
				 		goto out;
			
 
				 
			
 
				 	if (!m->current_pgpath)
			
 
				-		__choose_pgpath(m);
			
 
				+		__choose_pgpath(m, 0);
			
 
				 
			
 
				 	pgpath = m->current_pgpath;
			
 
				 
			
@@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work)
 
				 		must_queue = 0;
			
 
				 
			
 
				 	if (m->pg_init_required && !m->pg_init_in_progress && pgpath) {
			
 
				-		m->pgpath_to_activate = pgpath;
			
 
				 		m->pg_init_count++;
			
 
				 		m->pg_init_required = 0;
			
 
				-		m->pg_init_in_progress = 1;
			
 
				-		init_required = 1;
			
 
				+		list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) {
			
 
				+			if (queue_work(kmpath_handlerd, &tmp->activate_path))
			
 
				+				m->pg_init_in_progress++;
			
 
				+		}
			
 
				 	}
			
 
				-
			
 
				 out:
			
 
				 	spin_unlock_irqrestore(&m->lock, flags);
			
 
				-
			
 
				-	if (init_required)
			
 
				-		queue_work(kmpath_handlerd, &m->activate_path);
			
 
				-
			
 
				 	if (!must_queue)
			
 
				 		dispatch_queued_ios(m);
			
 
				 }
			
@@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
 
				 		return -EINVAL;
			
 
				 	}
			
 
				 
			
 
				+	if (ps_argc > as->argc) {
			
 
				+		dm_put_path_selector(pst);
			
 
				+		ti->error = "not enough arguments for path selector";
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				 	r = pst->create(&pg->ps, ps_argc, as->argv);
			
 
				 	if (r) {
			
 
				 		dm_put_path_selector(pst);
			
@@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
 
				 	}
			
 
				 
			
 
				 	if (m->hw_handler_name) {
			
 
				-		r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev),
			
 
				-				   m->hw_handler_name);
			
 
				+		struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
			
 
				+
			
 
				+		r = scsi_dh_attach(q, m->hw_handler_name);
			
 
				+		if (r == -EBUSY) {
			
 
				+			/*
			
 
				+			 * Already attached to different hw_handler,
			
 
				+			 * try to reattach with correct one.
			
 
				+			 */
			
 
				+			scsi_dh_detach(q);
			
 
				+			r = scsi_dh_attach(q, m->hw_handler_name);
			
 
				+		}
			
 
				+
			
 
				 		if (r < 0) {
			
 
				+			ti->error = "error attaching hardware handler";
			
 
				 			dm_put_device(ti, p->path.dev);
			
 
				 			goto bad;
			
 
				 		}
			
@@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
 
				 	if (!hw_argc)
			
 
				 		return 0;
			
 
				 
			
 
				+	if (hw_argc > as->argc) {
			
 
				+		ti->error = "not enough arguments for hardware handler";
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				 	m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
			
 
				 	request_module("scsi_dh_%s", m->hw_handler_name);
			
 
				 	if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
			
@@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 
				 		goto bad;
			
 
				 	}
			
 
				 
			
 
				+	ti->num_flush_requests = 1;
			
 
				+
			
 
				 	return 0;
			
 
				 
			
 
				  bad:
			
@@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti)
 
				 
			
 
				 	flush_workqueue(kmpath_handlerd);
			
 
				 	flush_workqueue(kmultipathd);
			
 
				+	flush_scheduled_work();
			
 
				 	free_multipath(m);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Map bios, recording original fields for later in case we have to resubmit
			
 
				+ * Map cloned requests
			
 
				  */
			
 
				-static int multipath_map(struct dm_target *ti, struct bio *bio,
			
 
				+static int multipath_map(struct dm_target *ti, struct request *clone,
			
 
				 			 union map_info *map_context)
			
 
				 {
			
 
				 	int r;
			
 
				 	struct dm_mpath_io *mpio;
			
 
				 	struct multipath *m = (struct multipath *) ti->private;
			
 
				 
			
 
				-	mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
			
 
				-	dm_bio_record(&mpio->details, bio);
			
 
				+	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
			
 
				+	if (!mpio)
			
 
				+		/* ENOMEM, requeue */
			
 
				+		return DM_MAPIO_REQUEUE;
			
 
				+	memset(mpio, 0, sizeof(*mpio));
			
 
				 
			
 
				 	map_context->ptr = mpio;
			
 
				-	bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
			
 
				-	r = map_io(m, bio, mpio, 0);
			
 
				+	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
			
 
				+	r = map_io(m, clone, mpio, 0);
			
 
				 	if (r < 0 || r == DM_MAPIO_REQUEUE)
			
 
				 		mempool_free(mpio, m->mpio_pool);
			
 
				 
			
@@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath)
 
				 
			
 
				 	pgpath->is_active = 1;
			
 
				 
			
 
				-	m->current_pgpath = NULL;
			
 
				-	if (!m->nr_valid_paths++ && m->queue_size)
			
 
				+	if (!m->nr_valid_paths++ && m->queue_size) {
			
 
				+		m->current_pgpath = NULL;
			
 
				 		queue_work(kmultipathd, &m->process_queued_ios);
			
 
				+	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
			
 
				+		if (queue_work(kmpath_handlerd, &pgpath->activate_path))
			
 
				+			m->pg_init_in_progress++;
			
 
				+	}
			
 
				 
			
 
				 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
			
 
				 		      pgpath->path.dev->name, m->nr_valid_paths);
			
@@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors)
 
				 
			
 
				 	spin_lock_irqsave(&m->lock, flags);
			
 
				 	if (errors) {
			
 
				-		DMERR("Could not failover device. Error %d.", errors);
			
 
				-		m->current_pgpath = NULL;
			
 
				-		m->current_pg = NULL;
			
 
				+		if (pgpath == m->current_pgpath) {
			
 
				+			DMERR("Could not failover device. Error %d.", errors);
			
 
				+			m->current_pgpath = NULL;
			
 
				+			m->current_pg = NULL;
			
 
				+		}
			
 
				 	} else if (!m->pg_init_required) {
			
 
				 		m->queue_io = 0;
			
 
				 		pg->bypassed = 0;
			
 
				 	}
			
 
				 
			
 
				-	m->pg_init_in_progress = 0;
			
 
				-	queue_work(kmultipathd, &m->process_queued_ios);
			
 
				+	m->pg_init_in_progress--;
			
 
				+	if (!m->pg_init_in_progress)
			
 
				+		queue_work(kmultipathd, &m->process_queued_ios);
			
 
				 	spin_unlock_irqrestore(&m->lock, flags);
			
 
				 }
			
 
				 
			
 
				 static void activate_path(struct work_struct *work)
			
 
				 {
			
 
				 	int ret;
			
 
				-	struct multipath *m =
			
 
				-		container_of(work, struct multipath, activate_path);
			
 
				-	struct dm_path *path;
			
 
				-	unsigned long flags;
			
 
				+	struct pgpath *pgpath =
			
 
				+		container_of(work, struct pgpath, activate_path);
			
 
				 
			
 
				-	spin_lock_irqsave(&m->lock, flags);
			
 
				-	path = &m->pgpath_to_activate->path;
			
 
				-	m->pgpath_to_activate = NULL;
			
 
				-	spin_unlock_irqrestore(&m->lock, flags);
			
 
				-	if (!path)
			
 
				-		return;
			
 
				-	ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
			
 
				-	pg_init_done(path, ret);
			
 
				+	ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev));
			
 
				+	pg_init_done(&pgpath->path, ret);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				  * end_io handling
			
 
				  */
			
 
				-static int do_end_io(struct multipath *m, struct bio *bio,
			
 
				+static int do_end_io(struct multipath *m, struct request *clone,
			
 
				 		     int error, struct dm_mpath_io *mpio)
			
 
				 {
			
 
				+	/*
			
 
				+	 * We don't queue any clone request inside the multipath target
			
 
				+	 * during end I/O handling, since those clone requests don't have
			
 
				+	 * bio clones.  If we queue them inside the multipath target,
			
 
				+	 * we need to make bio clones, that requires memory allocation.
			
 
				+	 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
			
 
				+	 *  don't have bio clones.)
			
 
				+	 * Instead of queueing the clone request here, we queue the original
			
 
				+	 * request into dm core, which will remake a clone request and
			
 
				+	 * clone bios for it and resubmit it later.
			
 
				+	 */
			
 
				+	int r = DM_ENDIO_REQUEUE;
			
 
				 	unsigned long flags;
			
 
				 
			
 
				-	if (!error)
			
 
				+	if (!error && !clone->errors)
			
 
				 		return 0;	/* I/O complete */
			
 
				 
			
 
				-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
			
 
				-		return error;
			
 
				-
			
 
				 	if (error == -EOPNOTSUPP)
			
 
				 		return error;
			
 
				 
			
 
				-	spin_lock_irqsave(&m->lock, flags);
			
 
				-	if (!m->nr_valid_paths) {
			
 
				-		if (__must_push_back(m)) {
			
 
				-			spin_unlock_irqrestore(&m->lock, flags);
			
 
				-			return DM_ENDIO_REQUEUE;
			
 
				-		} else if (!m->queue_if_no_path) {
			
 
				-			spin_unlock_irqrestore(&m->lock, flags);
			
 
				-			return -EIO;
			
 
				-		} else {
			
 
				-			spin_unlock_irqrestore(&m->lock, flags);
			
 
				-			goto requeue;
			
 
				-		}
			
 
				-	}
			
 
				-	spin_unlock_irqrestore(&m->lock, flags);
			
 
				-
			
 
				 	if (mpio->pgpath)
			
 
				 		fail_path(mpio->pgpath);
			
 
				 
			
 
				-      requeue:
			
 
				-	dm_bio_restore(&mpio->details, bio);
			
 
				-
			
 
				-	/* queue for the daemon to resubmit or fail */
			
 
				 	spin_lock_irqsave(&m->lock, flags);
			
 
				-	bio_list_add(&m->queued_ios, bio);
			
 
				-	m->queue_size++;
			
 
				-	if (!m->queue_io)
			
 
				-		queue_work(kmultipathd, &m->process_queued_ios);
			
 
				+	if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
			
 
				+		r = -EIO;
			
 
				 	spin_unlock_irqrestore(&m->lock, flags);
			
 
				 
			
 
				-	return DM_ENDIO_INCOMPLETE;	/* io not complete */
			
 
				+	return r;
			
 
				 }
			
 
				 
			
 
				-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
			
 
				+static int multipath_end_io(struct dm_target *ti, struct request *clone,
			
 
				 			    int error, union map_info *map_context)
			
 
				 {
			
 
				 	struct multipath *m = ti->private;
			
@@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
 
				 	struct path_selector *ps;
			
 
				 	int r;
			
 
				 
			
 
				-	r  = do_end_io(m, bio, error, mpio);
			
 
				+	r  = do_end_io(m, clone, error, mpio);
			
 
				 	if (pgpath) {
			
 
				 		ps = &pgpath->pg->ps;
			
 
				 		if (ps->type->end_io)
			
 
				-			ps->type->end_io(ps, &pgpath->path);
			
 
				+			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
			
 
				 	}
			
 
				-	if (r != DM_ENDIO_INCOMPLETE)
			
 
				-		mempool_free(mpio, m->mpio_pool);
			
 
				+	mempool_free(mpio, m->mpio_pool);
			
 
				 
			
 
				 	return r;
			
 
				 }
			
@@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 
				 	spin_lock_irqsave(&m->lock, flags);
			
 
				 
			
 
				 	if (!m->current_pgpath)
			
 
				-		__choose_pgpath(m);
			
 
				+		__choose_pgpath(m, 0);
			
 
				 
			
 
				 	if (m->current_pgpath) {
			
 
				 		bdev = m->current_pgpath->path.dev->bdev;
			
@@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 
				 	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
			
 
				 }
			
 
				 
			
 
				+/*
				+ * Call fn once for each path device in every priority group of this
				+ * target, passing ti->begin as the start argument.  Iteration stops at
				+ * the first non-zero result, which is returned; 0 means every call
				+ * returned 0 (or there were no paths).
				+ */
				+static int multipath_iterate_devices(struct dm_target *ti,
				+				     iterate_devices_callout_fn fn, void *data)
				+{
				+	struct multipath *mp = ti->private;
				+	struct priority_group *group;
				+	struct pgpath *cur;
				+
				+	list_for_each_entry(group, &mp->priority_groups, list) {
				+		list_for_each_entry(cur, &group->pgpaths, list) {
				+			int rc = fn(ti, cur->path.dev, ti->begin, data);
				+
				+			if (rc)
				+				return rc;
				+		}
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Ask dm core whether the request queue behind this path's device is busy. */
				+static int __pgpath_busy(struct pgpath *pgpath)
				+{
				+	return dm_underlying_device_busy(
				+			bdev_get_queue(pgpath->path.dev->bdev));
				+}
			
 
				+
			
 
				+/*
				+ * Busy check for dm core.
				+ *
				+ * "Busy" is returned only when I/Os could be mapped but the underlying
				+ * devices are busy, so that a mapped I/O would merely wait on the
				+ * underlying queue.  If we instead want to kill I/Os or queue them
				+ * internally because no map is available, we must not return "busy":
				+ * dm core would then stop handing us the I/Os and we could do neither.
				+ */
				+static int multipath_busy(struct dm_target *ti)
				+{
				+	struct multipath *m = ti->private;
				+	struct priority_group *pg;
				+	struct pgpath *pgpath;
				+	unsigned long flags;
				+	int seen_active = 0, idle_found = 0;
				+	int busy = 0;
				+
				+	spin_lock_irqsave(&m->lock, flags);
				+
				+	/* Guess which priority_group will be used at next mapping time */
				+	if (unlikely(!m->current_pgpath && m->next_pg))
				+		pg = m->next_pg;
				+	else if (likely(m->current_pg))
				+		pg = m->current_pg;
				+	else
				+		/*
				+		 * We don't know which pg will be used at next mapping time.
				+		 * __choose_pgpath() is deliberately not called here, so that
				+		 * a mere busy check never triggers pg_init.  Whether the
				+		 * devices used at next mapping time are busy is therefore
				+		 * unknown: report "not busy" and just try mapping.
				+		 */
				+		pg = NULL;
				+
				+	if (pg) {
				+		/*
				+		 * One non-busy active path is enough for the path selector
				+		 * to pick it, so such a pg is not busy.  A pg without any
				+		 * active path won't be used at all (current_pg will change
				+		 * at next mapping time), so it is not busy either.
				+		 */
				+		list_for_each_entry(pgpath, &pg->pgpaths, list) {
				+			if (!pgpath->is_active)
				+				continue;
				+
				+			seen_active = 1;
				+
				+			if (!__pgpath_busy(pgpath)) {
				+				idle_found = 1;
				+				break;
				+			}
				+		}
				+
				+		busy = seen_active && !idle_found;
				+	}
				+
				+	spin_unlock_irqrestore(&m->lock, flags);
				+
				+	return busy;
				+}
			
 
				+
			
 
				 /*-----------------------------------------------------------------
			
 
				  * Module setup
			
 
				  *---------------------------------------------------------------*/
			
 
				 static struct target_type multipath_target = {
			
 
				 	.name = "multipath",
			
 
				-	.version = {1, 0, 5},
			
 
				+	.version = {1, 1, 0},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr = multipath_ctr,
			
 
				 	.dtr = multipath_dtr,
			
 
				-	.map = multipath_map,
			
 
				-	.end_io = multipath_end_io,
			
 
				+	.map_rq = multipath_map,
			
 
				+	.rq_end_io = multipath_end_io,
			
 
				 	.presuspend = multipath_presuspend,
			
 
				 	.resume = multipath_resume,
			
 
				 	.status = multipath_status,
			
 
				 	.message = multipath_message,
			
 
				 	.ioctl  = multipath_ioctl,
			
 
				+	.iterate_devices = multipath_iterate_devices,
			
 
				+	.busy = multipath_busy,
			
 
				 };
			
 
				 
			
 
				 static int __init dm_multipath_init(void)
			
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -56,7 +56,8 @@ struct path_selector_type {
 
				 	 * the path fails.
			
 
				 	 */
			
 
				 	struct dm_path *(*select_path) (struct path_selector *ps,
			
 
				-				     unsigned *repeat_count);
			
 
				+					unsigned *repeat_count,
			
 
				+					size_t nr_bytes);
			
 
				 
			
 
				 	/*
			
 
				 	 * Notify the selector that a path has failed.
			
@@ -75,7 +76,10 @@ struct path_selector_type {
 
				 	int (*status) (struct path_selector *ps, struct dm_path *path,
			
 
				 		       status_type_t type, char *result, unsigned int maxlen);
			
 
				 
			
 
				-	int (*end_io) (struct path_selector *ps, struct dm_path *path);
			
 
				+	int (*start_io) (struct path_selector *ps, struct dm_path *path,
			
 
				+			 size_t nr_bytes);
			
 
				+	int (*end_io) (struct path_selector *ps, struct dm_path *path,
			
 
				+		       size_t nr_bytes);
			
 
				 };
			
 
				 
			
 
				 /* Register a path selector */
			
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -0,0 +1,263 @@
 
				+/*
			
 
				+ * Copyright (C) 2004-2005 IBM Corp.  All Rights Reserved.
			
 
				+ * Copyright (C) 2006-2009 NEC Corporation.
			
 
				+ *
			
 
				+ * dm-queue-length.c
			
 
				+ *
			
 
				+ * Module Author: Stefan Bader, IBM
			
 
				+ * Modified by: Kiyoshi Ueda, NEC
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ *
			
 
				+ * queue-length path selector - choose a path with the least number of
			
 
				+ * in-flight I/Os.
			
 
				+ */
			
 
				+
			
 
				+#include "dm.h"
			
 
				+#include "dm-path-selector.h"
			
 
				+
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/ctype.h>
			
 
				+#include <linux/errno.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <asm/atomic.h>
			
 
				+
			
 
				+#define DM_MSG_PREFIX	"multipath queue-length"
			
 
				+#define QL_MIN_IO	128
			
 
				+#define QL_VERSION	"0.1.0"
			
 
				+
			
 
				+/*
			
 
				+ * Per-selector state.  Paths are added to valid_paths by ql_add_path(),
			
 
				+ * moved to failed_paths by ql_fail_path() and moved back by
			
 
				+ * ql_reinstate_path(); ql_select_path() only looks at valid_paths.
			
 
				+ */
			
 
				+struct selector {
			
 
				+	struct list_head	valid_paths;
			
 
				+	struct list_head	failed_paths;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Per-path bookkeeping, stored in dm_path->pscontext by ql_add_path().
			
 
				+ * 'list' links the entry into one of the selector's two path lists,
			
 
				+ * 'repeat_count' is handed back to the caller by ql_select_path(), and
			
 
				+ * 'qlen' is incremented in ql_start_io() and decremented in ql_end_io().
			
 
				+ */
			
 
				+struct path_info {
			
 
				+	struct list_head	list;
			
 
				+	struct dm_path		*path;
			
 
				+	unsigned		repeat_count;
			
 
				+	atomic_t		qlen;	/* the number of in-flight I/Os */
			
 
				+};
			
 
				+
			
 
				+/*
				+ * Allocate a selector with both path lists initialised empty.
				+ * Returns NULL if the allocation fails.
				+ */
				+static struct selector *alloc_selector(void)
				+{
				+	struct selector *sel;
				+
				+	sel = kmalloc(sizeof(*sel), GFP_KERNEL);
				+	if (!sel)
				+		return NULL;
				+
				+	INIT_LIST_HEAD(&sel->valid_paths);
				+	INIT_LIST_HEAD(&sel->failed_paths);
				+
				+	return sel;
				+}
			
 
				+
			
 
				+/*
				+ * Create the selector state and hang it off ps->context.
				+ * argc/argv are not used.  Returns 0, or -ENOMEM when the allocation
				+ * fails (ps->context is left untouched in that case).
				+ */
				+static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
				+{
				+	struct selector *sel;
				+
				+	sel = alloc_selector();
				+	if (sel) {
				+		ps->context = sel;
				+		return 0;
				+	}
				+
				+	return -ENOMEM;
				+}
			
 
				+
			
 
				+/* Unlink and free every path_info on the given list, leaving it empty. */
				+static void ql_free_paths(struct list_head *paths)
				+{
				+	while (!list_empty(paths)) {
				+		struct path_info *victim =
				+			list_entry(paths->next, struct path_info, list);
				+
				+		list_del(&victim->list);
				+		kfree(victim);
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Tear down the selector: free the path records on both lists, then the
				+ * selector itself, and clear ps->context so no stale pointer remains.
				+ */
				+static void ql_destroy(struct path_selector *ps)
				+{
				+	struct selector *sel = ps->context;
				+
				+	ql_free_paths(&sel->valid_paths);
				+	ql_free_paths(&sel->failed_paths);
				+
				+	kfree(sel);
				+	ps->context = NULL;
				+}
			
 
				+
			
 
				+/*
				+ * Emit status text into result (at most maxlen bytes) and return the
				+ * number of bytes emitted.
				+ *
				+ * With a NULL path the selector-level status is emitted, which is just
				+ * "0 ".  Otherwise the per-path value is emitted: the in-flight count
				+ * for STATUSTYPE_INFO, the repeat count for STATUSTYPE_TABLE.
				+ */
				+static int ql_status(struct path_selector *ps, struct dm_path *path,
				+		     status_type_t type, char *result, unsigned maxlen)
				+{
				+	unsigned sz = 0;	/* DMEMIT() accumulates into sz */
				+	struct path_info *info;
				+
				+	if (!path) {
				+		DMEMIT("0 ");
				+		return sz;
				+	}
				+
				+	info = path->pscontext;
				+
				+	if (type == STATUSTYPE_INFO)
				+		DMEMIT("%d ", atomic_read(&info->qlen));
				+	else if (type == STATUSTYPE_TABLE)
				+		DMEMIT("%u ", info->repeat_count);
				+
				+	return sz;
				+}
			
 
				+
			
 
				+/*
				+ * Add a path to the selector's valid list.
				+ *
				+ * Arguments: [<repeat_count>]
				+ *	<repeat_count>: The number of I/Os before switching path.
				+ *			If not given, default (QL_MIN_IO) is used.
				+ *
				+ * Returns 0 on success.  On failure *error is set to a message and
				+ * -EINVAL (bad argument count or unparsable count) or -ENOMEM is
				+ * returned.
				+ */
				+static int ql_add_path(struct path_selector *ps, struct dm_path *path,
				+		       int argc, char **argv, char **error)
				+{
				+	struct selector *sel = ps->context;
				+	struct path_info *info;
				+	unsigned count = QL_MIN_IO;
				+
				+	if (argc > 1) {
				+		*error = "queue-length ps: incorrect number of arguments";
				+		return -EINVAL;
				+	}
				+
				+	if (argc == 1) {
				+		if (sscanf(argv[0], "%u", &count) != 1) {
				+			*error = "queue-length ps: invalid repeat count";
				+			return -EINVAL;
				+		}
				+	}
				+
				+	/* Allocate the path information structure */
				+	info = kmalloc(sizeof(*info), GFP_KERNEL);
				+	if (!info) {
				+		*error = "queue-length ps: Error allocating path information";
				+		return -ENOMEM;
				+	}
				+
				+	atomic_set(&info->qlen, 0);
				+	info->repeat_count = count;
				+	info->path = path;
				+
				+	/* Let the other callbacks find the record from the path. */
				+	path->pscontext = info;
				+
				+	list_add_tail(&info->list, &sel->valid_paths);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Take a path out of selection by moving it to the head of failed_paths. */
				+static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
				+{
				+	struct path_info *info = path->pscontext;
				+	struct selector *sel = ps->context;
				+
				+	list_move(&info->list, &sel->failed_paths);
				+}
			
 
				+
			
 
				+/*
				+ * Make a path selectable again by moving it to the tail of valid_paths.
				+ * Always returns 0.
				+ */
				+static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
				+{
				+	struct path_info *info = path->pscontext;
				+	struct selector *sel = ps->context;
				+
				+	list_move_tail(&info->list, &sel->valid_paths);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Select a path having the minimum number of in-flight I/Os.
				+ *
				+ * Returns NULL when there is no valid path.  Otherwise the chosen
				+ * path is returned and its repeat count is stored in *repeat_count.
				+ * nr_bytes is not used by this selector.
				+ */
				+static struct dm_path *ql_select_path(struct path_selector *ps,
				+				      unsigned *repeat_count, size_t nr_bytes)
				+{
				+	struct selector *sel = ps->context;
				+	struct path_info *cur = NULL;
				+	struct path_info *best = NULL;
				+
				+	if (list_empty(&sel->valid_paths))
				+		return NULL;
				+
				+	/*
				+	 * Rotate the first entry to the tail so that ties are not always
				+	 * won by the same path: change preferred path to evenly balance.
				+	 */
				+	list_move_tail(sel->valid_paths.next, &sel->valid_paths);
				+
				+	list_for_each_entry(cur, &sel->valid_paths, list) {
				+		if (!best ||
				+		    (atomic_read(&cur->qlen) < atomic_read(&best->qlen)))
				+			best = cur;
				+
				+		/* Nothing can beat an idle path; stop searching. */
				+		if (!atomic_read(&best->qlen))
				+			break;
				+	}
				+
				+	/* Defensive: the list was non-empty, so best is set by now. */
				+	if (!best)
				+		return NULL;
				+
				+	*repeat_count = best->repeat_count;
				+
				+	return best->path;
				+}
			
 
				+
			
 
				+/*
				+ * Account for one more in-flight I/O on this path.  nr_bytes is ignored:
				+ * this selector counts requests, not bytes.  Always returns 0.
				+ */
				+static int ql_start_io(struct path_selector *ps, struct dm_path *path,
				+		       size_t nr_bytes)
				+{
				+	struct path_info *info = path->pscontext;
				+
				+	atomic_inc(&info->qlen);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Account for one in-flight I/O on this path having completed.
				+ * nr_bytes is ignored.  Always returns 0.
				+ */
				+static int ql_end_io(struct path_selector *ps, struct dm_path *path,
				+		     size_t nr_bytes)
				+{
				+	struct path_info *info = path->pscontext;
				+
				+	atomic_dec(&info->qlen);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Operations table for the "queue-length" path selector; registered in
			
 
				+ * dm_ql_init() and unregistered in dm_ql_exit().  table_args and
			
 
				+ * info_args are both 1, matching the single per-path field that
			
 
				+ * ql_status() emits for each status type.
			
 
				+ */
			
 
				+static struct path_selector_type ql_ps = {
			
 
				+	.name		= "queue-length",
			
 
				+	.module		= THIS_MODULE,
			
 
				+	.table_args	= 1,
			
 
				+	.info_args	= 1,
			
 
				+	.create		= ql_create,
			
 
				+	.destroy	= ql_destroy,
			
 
				+	.status		= ql_status,
			
 
				+	.add_path	= ql_add_path,
			
 
				+	.fail_path	= ql_fail_path,
			
 
				+	.reinstate_path	= ql_reinstate_path,
			
 
				+	.select_path	= ql_select_path,
			
 
				+	.start_io	= ql_start_io,
			
 
				+	.end_io		= ql_end_io,
			
 
				+};
			
 
				+
			
 
				+/*
				+ * Module init: register the queue-length path selector.
				+ *
				+ * Returns the result of dm_register_path_selector().  The "loaded"
				+ * message is only logged when registration succeeded, so the log never
				+ * claims the selector is available right after reporting that
				+ * registration failed.
				+ */
				+static int __init dm_ql_init(void)
				+{
				+	int r = dm_register_path_selector(&ql_ps);
				+
				+	if (r < 0) {
				+		DMERR("register failed %d", r);
				+		return r;
				+	}
				+
				+	DMINFO("version " QL_VERSION " loaded");
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Module exit: unregister the selector and log an error if that fails. */
				+static void __exit dm_ql_exit(void)
				+{
				+	int rc;
				+
				+	rc = dm_unregister_path_selector(&ql_ps);
				+	if (rc < 0)
				+		DMERR("unregister failed %d", rc);
				+}
			
 
				+
			
 
				+module_init(dm_ql_init);
			
 
				+module_exit(dm_ql_exit);
			
 
				+
			
 
				+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
			
 
				+MODULE_DESCRIPTION(
			
 
				+	"(C) Copyright IBM Corp. 2004,2005   All Rights Reserved.\n"
			
 
				+	DM_NAME " path selector to balance the number of in-flight I/Os"
			
 
				+);
			
 
				+MODULE_LICENSE("GPL");
			
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/*
				+ * Call fn for each mirror leg with that leg's device and offset.
				+ * Stops at the first non-zero result and returns it; returns 0 when
				+ * every call returned 0 (or there are no mirrors).
				+ */
				+static int mirror_iterate_devices(struct dm_target *ti,
				+				  iterate_devices_callout_fn fn, void *data)
				+{
				+	struct mirror_set *ms = ti->private;
				+	unsigned leg;
				+
				+	for (leg = 0; leg < ms->nr_mirrors; leg++) {
				+		int rc = fn(ti, ms->mirror[leg].dev,
				+			    ms->mirror[leg].offset, data);
				+
				+		if (rc)
				+			return rc;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				 static struct target_type mirror_target = {
			
 
				 	.name	 = "mirror",
			
 
				-	.version = {1, 0, 20},
			
 
				+	.version = {1, 12, 0},
			
 
				 	.module	 = THIS_MODULE,
			
 
				 	.ctr	 = mirror_ctr,
			
 
				 	.dtr	 = mirror_dtr,
			
@@ -1295,6 +1309,7 @@ static struct target_type mirror_target = {
 
				 	.postsuspend = mirror_postsuspend,
			
 
				 	.resume	 = mirror_resume,
			
 
				 	.status	 = mirror_status,
			
 
				+	.iterate_devices = mirror_iterate_devices,
			
 
				 };
			
 
				 
			
 
				 static int __init dm_mirror_init(void)
			
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
 
				 
			
 
				 	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
			
 
				 	if (unlikely(!nreg))
			
 
				-		nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
			
 
				+		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
			
 
				 
			
 
				 	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
			
 
				 		      DM_RH_CLEAN : DM_RH_NOSYNC;
			
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
 
				 }
			
 
				 
			
 
				 static struct dm_path *rr_select_path(struct path_selector *ps,
			
 
				-				   unsigned *repeat_count)
			
 
				+				      unsigned *repeat_count, size_t nr_bytes)
			
 
				 {
			
 
				 	struct selector *s = (struct selector *) ps->context;
			
 
				 	struct path_info *pi = NULL;
			
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -0,0 +1,339 @@
 
				+/*
			
 
				+ * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
			
 
				+ *
			
 
				+ * Module Author: Kiyoshi Ueda
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ *
			
 
				+ * Throughput oriented path selector.
			
 
				+ */
			
 
				+
			
 
				+#include "dm.h"
			
 
				+#include "dm-path-selector.h"
			
 
				+
			
 
				+#define DM_MSG_PREFIX	"multipath service-time"
			
 
				+#define ST_MIN_IO	1	/* default repeat_count when st_add_path() is given no argument */
			
 
				+#define ST_MAX_RELATIVE_THROUGHPUT	100	/* largest relative_throughput st_add_path() accepts */
			
 
				+#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7	/* 2^7 = 128 exceeds ST_MAX_RELATIVE_THROUGHPUT */
			
 
				+#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)	/* a size below this can be multiplied by any accepted relative_throughput without overflowing size_t */
			
 
				+#define ST_VERSION	"0.2.0"	/* printed by dm_st_init() */
			
 
				+
			
 
				+struct selector {	/* per-selector state; st_create() stores it in ps->context */
			
 
				+	struct list_head valid_paths;	/* path_info entries st_select_path() chooses from */
			
 
				+	struct list_head failed_paths;	/* path_info entries moved here by st_fail_path() */
			
 
				+};
			
 
				+
			
 
				+struct path_info {	/* per-path state; st_add_path() stores it in path->pscontext */
			
 
				+	struct list_head list;	/* link on the selector's valid_paths or failed_paths */
			
 
				+	struct dm_path *path;	/* the path this entry describes */
			
 
				+	unsigned repeat_count;	/* handed back through st_select_path()'s repeat_count argument */
			
 
				+	unsigned relative_throughput;	/* weight in 0..ST_MAX_RELATIVE_THROUGHPUT; 0 means "avoid if possible" */
			
 
				+	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
			
 
				+};
			
 
				+
			
 
				+/*
				+ * Allocate a selector whose valid and failed path lists both start empty.
				+ * Returns NULL if the allocation fails.
				+ */
				+static struct selector *alloc_selector(void)
				+{
				+	struct selector *sel;
				+
				+	sel = kmalloc(sizeof(*sel), GFP_KERNEL);
				+	if (!sel)
				+		return NULL;
				+
				+	INIT_LIST_HEAD(&sel->valid_paths);
				+	INIT_LIST_HEAD(&sel->failed_paths);
				+
				+	return sel;
				+}
			
 
				+
			
 
				+/*
				+ * Allocate the selector state and attach it to ps->context.
				+ * argc and argv are not used.
				+ *
				+ * Returns 0 on success, or -ENOMEM (leaving ps->context untouched) if the
				+ * selector cannot be allocated.
				+ */
				+static int st_create(struct path_selector *ps, unsigned argc, char **argv)
				+{
				+	struct selector *sel = alloc_selector();
				+
				+	if (sel) {
				+		ps->context = sel;
				+		return 0;
				+	}
				+
				+	return -ENOMEM;
				+}
			
 
				+
			
 
				+/* Unlink and free every path_info queued on paths, leaving the list empty. */
				+static void free_paths(struct list_head *paths)
				+{
				+	/* Pop from the head until nothing is left; each entry is freed as it goes. */
				+	while (!list_empty(paths)) {
				+		struct path_info *head =
				+			list_entry(paths->next, struct path_info, list);
				+
				+		list_del(&head->list);
				+		kfree(head);
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Tear down the selector attached to ps: free the path_info entries on
				+ * both lists, free the selector itself and clear ps->context.
				+ */
				+static void st_destroy(struct path_selector *ps)
				+{
				+	struct selector *sel = ps->context;
				+	struct list_head *valid = &sel->valid_paths;
				+	struct list_head *failed = &sel->failed_paths;
				+
				+	free_paths(valid);
				+	free_paths(failed);
				+
				+	kfree(sel);
				+	ps->context = NULL;
				+}
			
 
				+
			
 
				+/*
				+ * Emit status text for the selector (path == NULL) or for one path.
				+ *
				+ * Selector: "0 ".
				+ * Path, STATUSTYPE_INFO:  "<in_flight_size> <relative_throughput> ".
				+ * Path, STATUSTYPE_TABLE: "<repeat_count> <relative_throughput> ".
				+ *
				+ * Returns the local sz counter.
				+ * NOTE(review): DMEMIT() is defined elsewhere; it is assumed to append to
				+ * result (bounded by maxlen) and advance sz - confirm.
				+ */
				+static int st_status(struct path_selector *ps, struct dm_path *path,
				+		     status_type_t type, char *result, unsigned maxlen)
				+{
				+	unsigned sz = 0;
				+	struct path_info *pi;
				+
				+	if (!path) {
				+		DMEMIT("0 ");
				+		return sz;
				+	}
				+
				+	pi = path->pscontext;
				+
				+	if (type == STATUSTYPE_INFO) {
				+		DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
				+		       pi->relative_throughput);
				+	} else if (type == STATUSTYPE_TABLE) {
				+		DMEMIT("%u %u ", pi->repeat_count,
				+		       pi->relative_throughput);
				+	}
				+
				+	return sz;
				+}
			
 
				+
			
 
				+/*
				+ * Parse the optional per-path arguments, allocate a path_info for path,
				+ * attach it to path->pscontext and append it to the selector's valid list.
				+ *
				+ * Returns 0 on success.  On failure *error is set to a message and the
				+ * return value is -EINVAL (wrong argument count, or an argument that is
				+ * not a plain unsigned number in range) or -ENOMEM (allocation failed).
				+ */
				+static int st_add_path(struct path_selector *ps, struct dm_path *path,
				+		       int argc, char **argv, char **error)
				+{
				+	struct selector *s = ps->context;
				+	struct path_info *pi;
				+	unsigned repeat_count = ST_MIN_IO;
				+	unsigned relative_throughput = 1;
				+	char dummy;
				+
				+	/*
				+	 * Arguments: [<repeat_count> [<relative_throughput>]]
				+	 *	<repeat_count>: The number of I/Os before switching path.
				+	 *			If not given, default (ST_MIN_IO) is used.
				+	 *	<relative_throughput>: The relative throughput value of
				+	 *			the path among all paths in the path-group.
				+	 *			The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
				+	 *			If not given, minimum value '1' is used.
				+	 *			If '0' is given, the path isn't selected while
				+	 *			other paths having a positive value are
				+	 *			available.
				+	 */
				+	if (argc > 2) {
				+		*error = "service-time ps: incorrect number of arguments";
				+		return -EINVAL;
				+	}
				+
				+	/*
				+	 * The trailing %c converts only when something follows the number,
				+	 * so requiring exactly one conversion also rejects input such as
				+	 * "10abc" instead of silently using 10.
				+	 */
				+	if (argc &&
				+	    (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
				+		*error = "service-time ps: invalid repeat count";
				+		return -EINVAL;
				+	}
				+
				+	if ((argc == 2) &&
				+	    (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
				+	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
				+		*error = "service-time ps: invalid relative_throughput value";
				+		return -EINVAL;
				+	}
				+
				+	/* allocate the path */
				+	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
				+	if (!pi) {
				+		*error = "service-time ps: Error allocating path context";
				+		return -ENOMEM;
				+	}
				+
				+	pi->path = path;
				+	pi->repeat_count = repeat_count;
				+	pi->relative_throughput = relative_throughput;
				+	atomic_set(&pi->in_flight_size, 0);
				+
				+	path->pscontext = pi;
				+
				+	list_add_tail(&pi->list, &s->valid_paths);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Take path out of selection by moving its path_info onto the failed list. */
				+static void st_fail_path(struct path_selector *ps, struct dm_path *path)
				+{
				+	struct path_info *failed = path->pscontext;
				+	struct selector *sel = ps->context;
				+
				+	list_move(&failed->list, &sel->failed_paths);
				+}
			
 
				+
			
 
				+/*
				+ * Make path selectable again by moving its path_info to the tail of the
				+ * valid list.  Always returns 0.
				+ */
				+static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
				+{
				+	struct path_info *revived = path->pscontext;
				+	struct selector *sel = ps->context;
				+
				+	list_move_tail(&revived->list, &sel->valid_paths);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Compare the estimated service time of 2 paths, pi1 and pi2,
				+ * for the incoming I/O.
				+ *
				+ * Returns:
				+ * < 0 : pi1 is better
				+ * 0   : no difference between pi1 and pi2
				+ * > 0 : pi2 is better
				+ *
				+ * Description:
				+ * Basically, the service time is estimated by:
				+ *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
				+ * To reduce the calculation, some optimizations are made.
				+ * (See comments inline)
				+ */
				+static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
				+			   size_t incoming)
				+{
				+	size_t sz1, sz2, st1, st2;
				+
				+	sz1 = atomic_read(&pi1->in_flight_size);
				+	sz2 = atomic_read(&pi2->in_flight_size);
				+
				+	/*
				+	 * Case 1: Both have same throughput value. Choose less loaded path.
				+	 *
				+	 * NOTE(review): this relies on both sizes being int-sized values
				+	 * (they come from atomic_read()), so that their difference keeps
				+	 * its sign when converted to the int return type.
				+	 */
				+	if (pi1->relative_throughput == pi2->relative_throughput)
				+		return sz1 - sz2;
				+
				+	/*
				+	 * Case 2a: Both have same load. Choose higher throughput path.
				+	 * Case 2b: One path has no throughput value. Choose the other one.
				+	 */
				+	if (sz1 == sz2 ||
				+	    !pi1->relative_throughput || !pi2->relative_throughput)
				+		return pi2->relative_throughput - pi1->relative_throughput;
				+
				+	/*
				+	 * Case 3: Calculate service time. Choose faster path.
				+	 *         Service time using pi1:
				+	 *             st1 = (sz1 + incoming) / pi1->relative_throughput
				+	 *         Service time using pi2:
				+	 *             st2 = (sz2 + incoming) / pi2->relative_throughput
				+	 *
				+	 *         To avoid the division, transform the expression to use
				+	 *         multiplication.
				+	 *         Because ->relative_throughput > 0 here, if st1 < st2,
				+	 *         the expressions below are the same meaning:
				+	 *             (sz1 + incoming) / pi1->relative_throughput <
				+	 *                 (sz2 + incoming) / pi2->relative_throughput
				+	 *             (sz1 + incoming) * pi2->relative_throughput <
				+	 *                 (sz2 + incoming) * pi1->relative_throughput
				+	 *         So use the latter one.
				+	 */
				+	sz1 += incoming;
				+	sz2 += incoming;
				+	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
				+		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
				+		/*
				+		 * Size may be too big for multiplying pi->relative_throughput
				+		 * and overflow.
				+		 * To avoid the overflow and mis-selection, shift down both.
				+		 */
				+		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
				+		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
				+	}
				+	st1 = sz1 * pi2->relative_throughput;
				+	st2 = sz2 * pi1->relative_throughput;
				+
				+	/*
				+	 * Compare explicitly instead of returning st1 - st2: the products
				+	 * are size_t and their difference need not fit in an int, so the
				+	 * truncated value could have the wrong sign (or be 0) and pick the
				+	 * slower path.
				+	 */
				+	if (st1 != st2)
				+		return st1 < st2 ? -1 : 1;
				+
				+	/*
				+	 * Case 4: Service time is equal. Choose higher throughput path.
				+	 */
				+	return pi2->relative_throughput - pi1->relative_throughput;
				+}
			
 
				+
			
 
				+/*
				+ * Choose the valid path that st_compare_load() rates best for an I/O of
				+ * nr_bytes.
				+ *
				+ * Returns NULL when there is no valid path; otherwise returns the chosen
				+ * path and stores its repeat count in *repeat_count.
				+ */
				+static struct dm_path *st_select_path(struct path_selector *ps,
				+				      unsigned *repeat_count, size_t nr_bytes)
				+{
				+	struct selector *sel = ps->context;
				+	struct path_info *cand = NULL, *best = NULL;
				+
				+	if (list_empty(&sel->valid_paths))
				+		return NULL;
				+
				+	/*
				+	 * Rotate the list by one so that a different path is examined
				+	 * first each time; among equally rated paths the earliest wins.
				+	 */
				+	list_move_tail(sel->valid_paths.next, &sel->valid_paths);
				+
				+	list_for_each_entry(cand, &sel->valid_paths, list) {
				+		if (best && st_compare_load(cand, best, nr_bytes) >= 0)
				+			continue;
				+		best = cand;
				+	}
				+
				+	if (!best)
				+		return NULL;
				+
				+	*repeat_count = best->repeat_count;
				+
				+	return best->path;
				+}
			
 
				+
			
 
				+/* Add nr_bytes to path's in-flight byte count.  Always returns 0. */
				+static int st_start_io(struct path_selector *ps, struct dm_path *path,
				+		       size_t nr_bytes)
				+{
				+	struct path_info *info = path->pscontext;
				+
				+	atomic_add(nr_bytes, &info->in_flight_size);
				+	return 0;
				+}
			
 
				+
			
 
				+/* Subtract nr_bytes from path's in-flight byte count.  Always returns 0. */
				+static int st_end_io(struct path_selector *ps, struct dm_path *path,
				+		     size_t nr_bytes)
				+{
				+	struct path_info *info = path->pscontext;
				+
				+	atomic_sub(nr_bytes, &info->in_flight_size);
				+	return 0;
				+}
			
 
				+
			
 
				+static struct path_selector_type st_ps = {	/* registered by dm_st_init(), unregistered by dm_st_exit() */
			
 
				+	.name		= "service-time",
			
 
				+	.module		= THIS_MODULE,
			
 
				+	.table_args	= 2,	/* st_status() emits two values per path for STATUSTYPE_TABLE */
			
 
				+	.info_args	= 2,	/* st_status() emits two values per path for STATUSTYPE_INFO */
			
 
				+	.create		= st_create,
			
 
				+	.destroy	= st_destroy,
			
 
				+	.status		= st_status,
			
 
				+	.add_path	= st_add_path,
			
 
				+	.fail_path	= st_fail_path,
			
 
				+	.reinstate_path	= st_reinstate_path,
			
 
				+	.select_path	= st_select_path,
			
 
				+	.start_io	= st_start_io,	/* adds to in_flight_size */
			
 
				+	.end_io		= st_end_io,	/* subtracts from in_flight_size */
			
 
				+};
			
 
				+
			
 
				+/*
				+ * Module init: register the service-time path selector.
				+ *
				+ * Returns the result of dm_register_path_selector().  A negative result
				+ * is logged as an error; the "loaded" message is printed only when
				+ * registration did not fail, so the log never claims a failed load
				+ * succeeded.
				+ */
				+static int __init dm_st_init(void)
				+{
				+	int r = dm_register_path_selector(&st_ps);
				+
				+	if (r < 0) {
				+		DMERR("register failed %d", r);
				+		return r;
				+	}
				+
				+	DMINFO("version " ST_VERSION " loaded");
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Module exit: unregister the path selector, logging a negative result. */
				+static void __exit dm_st_exit(void)
				+{
				+	int err;
				+
				+	err = dm_unregister_path_selector(&st_ps);
				+	if (err < 0)
				+		DMERR("unregister failed %d", err);
				+}
			
 
				+
			
 
				+module_init(dm_st_init);
			
 
				+module_exit(dm_st_exit);
			
 
				+
			
 
				+MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
			
 
				+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
			
 
				+MODULE_LICENSE("GPL");
			
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 
				 	/*
			
 
				 	 * Commit exceptions to disk.
			
 
				 	 */
			
 
				-	if (ps->valid && area_io(ps, WRITE))
			
 
				+	if (ps->valid && area_io(ps, WRITE_BARRIER))
			
 
				 		ps->valid = 0;
			
 
				 
			
 
				 	/*
			
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 
			
 
				 	ti->private = s;
			
 
				 	ti->split_io = s->store->chunk_size;
			
 
				+	ti->num_flush_requests = 1;
			
 
				 
			
 
				 	return 0;
			
 
				 
			
@@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 
				 	chunk_t chunk;
			
 
				 	struct dm_snap_pending_exception *pe = NULL;
			
 
				 
			
 
				+	if (unlikely(bio_empty_barrier(bio))) {
			
 
				+		bio->bi_bdev = s->store->cow->bdev;
			
 
				+		return DM_MAPIO_REMAPPED;
			
 
				+	}
			
 
				+
			
 
				 	chunk = sector_to_chunk(s->store, bio->bi_sector);
			
 
				 
			
 
				 	/* Full snapshots are not usable */
			
@@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	ti->private = dev;
			
 
				+	ti->num_flush_requests = 1;
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
 
				 	struct dm_dev *dev = ti->private;
			
 
				 	bio->bi_bdev = dev->bdev;
			
 
				 
			
 
				+	if (unlikely(bio_empty_barrier(bio)))
			
 
				+		return DM_MAPIO_REMAPPED;
			
 
				+
			
 
				 	/* Only tell snapshots if this is a write */
			
 
				 	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
			
 
				 }
			
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	sc->stripes = stripes;
			
 
				 	sc->stripe_width = width;
			
 
				 	ti->split_io = chunk_size;
			
 
				+	ti->num_flush_requests = stripes;
			
 
				 
			
 
				 	sc->chunk_mask = ((sector_t) chunk_size) - 1;
			
 
				 	for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
			
@@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
 
				 		      union map_info *map_context)
			
 
				 {
			
 
				 	struct stripe_c *sc = (struct stripe_c *) ti->private;
			
 
				+	sector_t offset, chunk;
			
 
				+	uint32_t stripe;
			
 
				 
			
 
				-	sector_t offset = bio->bi_sector - ti->begin;
			
 
				-	sector_t chunk = offset >> sc->chunk_shift;
			
 
				-	uint32_t stripe = sector_div(chunk, sc->stripes);
			
 
				+	if (unlikely(bio_empty_barrier(bio))) {
			
 
				+		BUG_ON(map_context->flush_request >= sc->stripes);
			
 
				+		bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
			
 
				+		return DM_MAPIO_REMAPPED;
			
 
				+	}
			
 
				+
			
 
				+	offset = bio->bi_sector - ti->begin;
			
 
				+	chunk = offset >> sc->chunk_shift;
			
 
				+	stripe = sector_div(chunk, sc->stripes);
			
 
				 
			
 
				 	bio->bi_bdev = sc->stripe[stripe].dev->bdev;
			
 
				 	bio->bi_sector = sc->stripe[stripe].physical_start +
			
@@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
 
				 	return error;
			
 
				 }
			
 
				 
			
 
				+/*
				+ * Call fn for the entries of sc->stripe[] in order, passing each entry's
				+ * device and physical_start together with the caller's data.
				+ *
				+ * Stops at the first call that returns non-zero and returns that value;
				+ * returns 0 if every call returned 0.  The first entry is always visited
				+ * before sc->stripes is consulted.
				+ */
				+static int stripe_iterate_devices(struct dm_target *ti,
				+				  iterate_devices_callout_fn fn, void *data)
				+{
				+	struct stripe_c *sc = ti->private;
				+	unsigned idx = 0;
				+	int r;
				+
				+	for (;;) {
				+		r = fn(ti, sc->stripe[idx].dev,
				+		       sc->stripe[idx].physical_start, data);
				+		if (r || ++idx >= sc->stripes)
				+			break;
				+	}
				+
				+	return r;
				+}
			
 
				+
			
 
				 static struct target_type stripe_target = {
			
 
				 	.name   = "striped",
			
 
				-	.version = {1, 1, 0},
			
 
				+	.version = {1, 2, 0},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr    = stripe_ctr,
			
 
				 	.dtr    = stripe_dtr,
			
 
				 	.map    = stripe_map,
			
 
				 	.end_io = stripe_end_io,
			
 
				 	.status = stripe_status,
			
 
				+	.iterate_devices = stripe_iterate_devices,
			
 
				 };
			
 
				 
			
 
				 int __init dm_stripe_init(void)
			
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
 
				 	return strlen(buf);
			
 
				 }
			
 
				 
			
 
				+/*
				+ * Write dm_suspended(md) as a decimal number followed by a newline into
				+ * buf and return the number of characters written.
				+ */
				+static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
				+{
				+	/* sprintf() already returns the length strlen(buf) would report. */
				+	return sprintf(buf, "%d\n", dm_suspended(md));
				+}
			
 
				+
			
 
				 static DM_ATTR_RO(name);
			
 
				 static DM_ATTR_RO(uuid);
			
 
				+static DM_ATTR_RO(suspended);
			
 
				 
			
 
				 static struct attribute *dm_attrs[] = {
			
 
				 	&dm_attr_name.attr,
			
 
				 	&dm_attr_uuid.attr,
			
 
				+	&dm_attr_suspended.attr,
			
 
				 	NULL,
			
 
				 };
			
 
				 
			
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -41,6 +41,7 @@
 
				 struct dm_table {
			
 
				 	struct mapped_device *md;
			
 
				 	atomic_t holders;
			
 
				+	unsigned type;
			
 
				 
			
 
				 	/* btree table */
			
 
				 	unsigned int depth;
			
@@ -62,15 +63,11 @@ struct dm_table {
 
				 	/* a list of devices used by this table */
			
 
				 	struct list_head devices;
			
 
				 
			
 
				-	/*
			
 
				-	 * These are optimistic limits taken from all the
			
 
				-	 * targets, some targets will need smaller limits.
			
 
				-	 */
			
 
				-	struct io_restrictions limits;
			
 
				-
			
 
				 	/* events get handed up using this callback */
			
 
				 	void (*event_fn)(void *);
			
 
				 	void *event_context;
			
 
				+
			
 
				+	struct dm_md_mempools *mempools;
			
 
				 };
			
 
				 
			
 
				 /*
			
@@ -88,43 +85,6 @@ static unsigned int int_log(unsigned int n, unsigned int base)
 
				 	return result;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Returns the minimum that is _not_ zero, unless both are zero.
			
 
				- */
			
 
				-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
			
 
				-
			
 
				-/*
			
 
				- * Combine two io_restrictions, always taking the lower value.
			
 
				- */
			
 
				-static void combine_restrictions_low(struct io_restrictions *lhs,
			
 
				-				     struct io_restrictions *rhs)
			
 
				-{
			
 
				-	lhs->max_sectors =
			
 
				-		min_not_zero(lhs->max_sectors, rhs->max_sectors);
			
 
				-
			
 
				-	lhs->max_phys_segments =
			
 
				-		min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments);
			
 
				-
			
 
				-	lhs->max_hw_segments =
			
 
				-		min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
			
 
				-
			
 
				-	lhs->logical_block_size = max(lhs->logical_block_size,
			
 
				-				      rhs->logical_block_size);
			
 
				-
			
 
				-	lhs->max_segment_size =
			
 
				-		min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
			
 
				-
			
 
				-	lhs->max_hw_sectors =
			
 
				-		min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors);
			
 
				-
			
 
				-	lhs->seg_boundary_mask =
			
 
				-		min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
			
 
				-
			
 
				-	lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn);
			
 
				-
			
 
				-	lhs->no_cluster |= rhs->no_cluster;
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * Calculate the index of the child node of the n'th node k'th key.
			
 
				  */
			
@@ -267,6 +227,8 @@ static void free_devices(struct list_head *devices)
 
				 	list_for_each_safe(tmp, next, devices) {
			
 
				 		struct dm_dev_internal *dd =
			
 
				 		    list_entry(tmp, struct dm_dev_internal, list);
			
 
				+		DMWARN("dm_table_destroy: dm_put_device call missing for %s",
			
 
				+		       dd->dm_dev.name);
			
 
				 		kfree(dd);
			
 
				 	}
			
 
				 }
			
@@ -296,12 +258,10 @@ void dm_table_destroy(struct dm_table *t)
 
				 	vfree(t->highs);
			
 
				 
			
 
				 	/* free the device list */
			
 
				-	if (t->devices.next != &t->devices) {
			
 
				-		DMWARN("devices still present during destroy: "
			
 
				-		       "dm_table_remove_device calls missing");
			
 
				-
			
 
				+	if (t->devices.next != &t->devices)
			
 
				 		free_devices(&t->devices);
			
 
				-	}
			
 
				+
			
 
				+	dm_free_md_mempools(t->mempools);
			
 
				 
			
 
				 	kfree(t);
			
 
				 }
			
@@ -385,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 
				 /*
			
 
				  * If possible, this checks an area of a destination device is valid.
			
 
				  */
			
 
				-static int check_device_area(struct dm_dev_internal *dd, sector_t start,
			
 
				-			     sector_t len)
			
 
				+/*
				+ * Check that the ti->len sectors starting at sector start fit inside dev
				+ * and that start and ti->len are aligned to the logical block size found
				+ * in the queue_limits passed through data.
				+ *
				+ * Returns 1 if the area is acceptable, or if the device reports a size of
				+ * zero so that nothing can be checked; returns 0 after logging a warning
				+ * otherwise.
				+ */
				+static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
				+				sector_t start, void *data)
				 {
				-	sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT;
				+	struct queue_limits *limits = data;
				+	struct block_device *bdev = dev->bdev;
				+	sector_t dev_size =
				+		i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
				+	unsigned short logical_block_size_sectors =
				+		limits->logical_block_size >> SECTOR_SHIFT;
				+	char b[BDEVNAME_SIZE];
				 
				 	if (!dev_size)
				 		return 1;
				 
				-	return ((start < dev_size) && (len <= (dev_size - start)));
				+	/*
				+	 * Compare the length against the space left after start rather
				+	 * than computing start + ti->len, which could wrap around and
				+	 * make an oversized target look valid.  The subtraction cannot
				+	 * underflow because start < dev_size has just been established.
				+	 */
				+	if ((start >= dev_size) || (ti->len > dev_size - start)) {
				+		DMWARN("%s: %s too small for target",
				+		       dm_device_name(ti->table->md), bdevname(bdev, b));
				+		return 0;
				+	}
				+
				+	/* A block size of one sector (or less) imposes no alignment. */
				+	if (logical_block_size_sectors <= 1)
				+		return 1;
				+
				+	if (start & (logical_block_size_sectors - 1)) {
				+		DMWARN("%s: start=%llu not aligned to h/w "
				+		       "logical block size %hu of %s",
				+		       dm_device_name(ti->table->md),
				+		       (unsigned long long)start,
				+		       limits->logical_block_size, bdevname(bdev, b));
				+		return 0;
				+	}
				+
				+	if (ti->len & (logical_block_size_sectors - 1)) {
				+		DMWARN("%s: len=%llu not aligned to h/w "
				+		       "logical block size %hu of %s",
				+		       dm_device_name(ti->table->md),
				+		       (unsigned long long)ti->len,
				+		       limits->logical_block_size, bdevname(bdev, b));
				+		return 0;
				+	}
				+
				+	return 1;
				 }
			
 
				 
			
 
				 /*
			
@@ -479,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 
				 	}
			
 
				 	atomic_inc(&dd->count);
			
 
				 
			
 
				-	if (!check_device_area(dd, start, len)) {
			
 
				-		DMWARN("device %s too small for target", path);
			
 
				-		dm_put_device(ti, &dd->dm_dev);
			
 
				-		return -EINVAL;
			
 
				-	}
			
 
				-
			
 
				 	*result = &dd->dm_dev;
			
 
				-
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
			
 
				+/*
				+ * Returns the minimum that is _not_ zero, unless both are zero.
				+ *
				+ * Every argument and the whole expansion are parenthesized so the macro
				+ * can be used inside larger expressions and with non-trivial arguments.
				+ * Note that l and r are each evaluated more than once, so they must not
				+ * have side effects.
				+ */
				+#define min_not_zero(l, r) \
				+	(((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
			
 
				+
			
 
				+int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
			
 
				+			 sector_t start, void *data)
			
 
				 {
			
 
				+	struct queue_limits *limits = data;
			
 
				+	struct block_device *bdev = dev->bdev;
			
 
				 	struct request_queue *q = bdev_get_queue(bdev);
			
 
				-	struct io_restrictions *rs = &ti->limits;
			
 
				 	char b[BDEVNAME_SIZE];
			
 
				 
			
 
				 	if (unlikely(!q)) {
			
 
				 		DMWARN("%s: Cannot set limits for nonexistent device %s",
			
 
				 		       dm_device_name(ti->table->md), bdevname(bdev, b));
			
 
				-		return;
			
 
				+		return 0;
			
 
				 	}
			
 
				 
			
 
				-	/*
			
 
				-	 * Combine the device limits low.
			
 
				-	 *
			
 
				-	 * FIXME: if we move an io_restriction struct
			
 
				-	 *        into q this would just be a call to
			
 
				-	 *        combine_restrictions_low()
			
 
				-	 */
			
 
				-	rs->max_sectors =
			
 
				-		min_not_zero(rs->max_sectors, queue_max_sectors(q));
			
 
				+	if (blk_stack_limits(limits, &q->limits, start) < 0)
			
 
				+		DMWARN("%s: target device %s is misaligned",
			
 
				+		       dm_device_name(ti->table->md), bdevname(bdev, b));
			
 
				 
			
 
				 	/*
			
 
				 	 * Check if merge fn is supported.
			
@@ -519,48 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 
				 	 */
			
 
				 
			
 
				 	if (q->merge_bvec_fn && !ti->type->merge)
			
 
				-		rs->max_sectors =
			
 
				-			min_not_zero(rs->max_sectors,
			
 
				+		limits->max_sectors =
			
 
				+			min_not_zero(limits->max_sectors,
			
 
				 				     (unsigned int) (PAGE_SIZE >> 9));
			
 
				-
			
 
				-	rs->max_phys_segments =
			
 
				-		min_not_zero(rs->max_phys_segments,
			
 
				-			     queue_max_phys_segments(q));
			
 
				-
			
 
				-	rs->max_hw_segments =
			
 
				-		min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q));
			
 
				-
			
 
				-	rs->logical_block_size = max(rs->logical_block_size,
			
 
				-				     queue_logical_block_size(q));
			
 
				-
			
 
				-	rs->max_segment_size =
			
 
				-		min_not_zero(rs->max_segment_size, queue_max_segment_size(q));
			
 
				-
			
 
				-	rs->max_hw_sectors =
			
 
				-		min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q));
			
 
				-
			
 
				-	rs->seg_boundary_mask =
			
 
				-		min_not_zero(rs->seg_boundary_mask,
			
 
				-			     queue_segment_boundary(q));
			
 
				-
			
 
				-	rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q));
			
 
				-
			
 
				-	rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
			
 
				+	return 0;
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dm_set_device_limits);
			
 
				 
			
 
				 int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
			
 
				 		  sector_t len, fmode_t mode, struct dm_dev **result)
			
 
				 {
			
 
				-	int r = __table_get_device(ti->table, ti, path,
			
 
				-				   start, len, mode, result);
			
 
				-
			
 
				-	if (!r)
			
 
				-		dm_set_device_limits(ti, (*result)->bdev);
			
 
				-
			
 
				-	return r;
			
 
				+	return __table_get_device(ti->table, ti, path,
			
 
				+				  start, len, mode, result);
			
 
				 }
			
 
				 
			
 
				+
			
 
				 /*
			
 
				  * Decrement a devices use count and remove it if necessary.
			
 
				  */
			
@@ -675,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static void check_for_valid_limits(struct io_restrictions *rs)
			
 
				+/*
			
 
				+ * Impose necessary and sufficient conditions on a device's table such
			
 
				+ * that any incoming bio which respects its logical_block_size can be
			
 
				+ * processed successfully.  If it falls across the boundary between
			
 
				+ * two or more targets, the size of each piece it gets split into must
			
 
				+ * be compatible with the logical_block_size of the target processing it.
			
 
				+ */
			
 
				+static int validate_hardware_logical_block_alignment(struct dm_table *table,
			
 
				+						 struct queue_limits *limits)
			
 
				 {
			
 
				-	if (!rs->max_sectors)
			
 
				-		rs->max_sectors = SAFE_MAX_SECTORS;
			
 
				-	if (!rs->max_hw_sectors)
			
 
				-		rs->max_hw_sectors = SAFE_MAX_SECTORS;
			
 
				-	if (!rs->max_phys_segments)
			
 
				-		rs->max_phys_segments = MAX_PHYS_SEGMENTS;
			
 
				-	if (!rs->max_hw_segments)
			
 
				-		rs->max_hw_segments = MAX_HW_SEGMENTS;
			
 
				-	if (!rs->logical_block_size)
			
 
				-		rs->logical_block_size = 1 << SECTOR_SHIFT;
			
 
				-	if (!rs->max_segment_size)
			
 
				-		rs->max_segment_size = MAX_SEGMENT_SIZE;
			
 
				-	if (!rs->seg_boundary_mask)
			
 
				-		rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
			
 
				-	if (!rs->bounce_pfn)
			
 
				-		rs->bounce_pfn = -1;
			
 
				+	/*
			
 
				+	 * This function uses arithmetic modulo the logical_block_size
			
 
				+	 * (in units of 512-byte sectors).
			
 
				+	 */
			
 
				+	unsigned short device_logical_block_size_sects =
			
 
				+		limits->logical_block_size >> SECTOR_SHIFT;
			
 
				+
			
 
				+	/*
			
 
				+	 * Offset of the start of the next table entry, mod logical_block_size.
			
 
				+	 */
			
 
				+	unsigned short next_target_start = 0;
			
 
				+
			
 
				+	/*
			
 
				+	 * Given an aligned bio that extends beyond the end of a
			
 
				+	 * target, how many sectors must the next target handle?
			
 
				+	 */
			
 
				+	unsigned short remaining = 0;
			
 
				+
			
 
				+	struct dm_target *uninitialized_var(ti);
			
 
				+	struct queue_limits ti_limits;
			
 
				+	unsigned i = 0;
			
 
				+
			
 
				+	/*
			
 
				+	 * Check each entry in the table in turn.
			
 
				+	 */
			
 
				+	while (i < dm_table_get_num_targets(table)) {
			
 
				+		ti = dm_table_get_target(table, i++);
			
 
				+
			
 
				+		blk_set_default_limits(&ti_limits);
			
 
				+
			
 
				+		/* combine all target devices' limits */
			
 
				+		if (ti->type->iterate_devices)
			
 
				+			ti->type->iterate_devices(ti, dm_set_device_limits,
			
 
				+						  &ti_limits);
			
 
				+
			
 
				+		/*
			
 
				+		 * If the remaining sectors fall entirely within this
			
 
				+		 * table entry are they compatible with its logical_block_size?
			
 
				+		 */
			
 
				+		if (remaining < ti->len &&
			
 
				+		    remaining & ((ti_limits.logical_block_size >>
			
 
				+				  SECTOR_SHIFT) - 1))
			
 
				+			break;	/* Error */
			
 
				+
			
 
				+		next_target_start =
			
 
				+		    (unsigned short) ((next_target_start + ti->len) &
			
 
				+				      (device_logical_block_size_sects - 1));
			
 
				+		remaining = next_target_start ?
			
 
				+		    device_logical_block_size_sects - next_target_start : 0;
			
 
				+	}
			
 
				+
			
 
				+	if (remaining) {
			
 
				+		DMWARN("%s: table line %u (start sect %llu len %llu) "
			
 
				+		       "not aligned to h/w logical block size %hu",
			
 
				+		       dm_device_name(table->md), i,
			
 
				+		       (unsigned long long) ti->begin,
			
 
				+		       (unsigned long long) ti->len,
			
 
				+		       limits->logical_block_size);
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				 int dm_table_add_target(struct dm_table *t, const char *type,
			
@@ -747,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
				 
			
 
				 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
			
 
				 
			
 
				-	/* FIXME: the plan is to combine high here and then have
			
 
				-	 * the merge fn apply the target level restrictions. */
			
 
				-	combine_restrictions_low(&t->limits, &tgt->limits);
			
 
				 	return 0;
			
 
				 
			
 
				  bad:
			
@@ -758,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				+int dm_table_set_type(struct dm_table *t)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	unsigned bio_based = 0, request_based = 0;
			
 
				+	struct dm_target *tgt;
			
 
				+	struct dm_dev_internal *dd;
			
 
				+	struct list_head *devices;
			
 
				+
			
 
				+	for (i = 0; i < t->num_targets; i++) {
			
 
				+		tgt = t->targets + i;
			
 
				+		if (dm_target_request_based(tgt))
			
 
				+			request_based = 1;
			
 
				+		else
			
 
				+			bio_based = 1;
			
 
				+
			
 
				+		if (bio_based && request_based) {
			
 
				+			DMWARN("Inconsistent table: different target types"
			
 
				+			       " can't be mixed up");
			
 
				+			return -EINVAL;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if (bio_based) {
			
 
				+		/* We must use this table as bio-based */
			
 
				+		t->type = DM_TYPE_BIO_BASED;
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	BUG_ON(!request_based); /* No targets in this table */
			
 
				+
			
 
				+	/* Non-request-stackable devices can't be used for request-based dm */
			
 
				+	devices = dm_table_get_devices(t);
			
 
				+	list_for_each_entry(dd, devices, list) {
			
 
				+		if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) {
			
 
				+			DMWARN("table load rejected: including"
			
 
				+			       " non-request-stackable devices");
			
 
				+			return -EINVAL;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * Request-based dm supports only tables that have a single target now.
			
 
				+	 * To support multiple targets, request splitting support is needed,
			
 
				+	 * and that needs lots of changes in the block-layer.
			
 
				+	 * (e.g. request completion process for partial completion.)
			
 
				+	 */
			
 
				+	if (t->num_targets > 1) {
			
 
				+		DMWARN("Request-based dm doesn't support multiple targets yet");
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	t->type = DM_TYPE_REQUEST_BASED;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+unsigned dm_table_get_type(struct dm_table *t)
			
 
				+{
			
 
				+	return t->type;
			
 
				+}
			
 
				+
			
 
				+bool dm_table_bio_based(struct dm_table *t)
			
 
				+{
			
 
				+	return dm_table_get_type(t) == DM_TYPE_BIO_BASED;
			
 
				+}
			
 
				+
			
 
				+bool dm_table_request_based(struct dm_table *t)
			
 
				+{
			
 
				+	return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
			
 
				+}
			
 
				+
			
 
				+int dm_table_alloc_md_mempools(struct dm_table *t)
			
 
				+{
			
 
				+	unsigned type = dm_table_get_type(t);
			
 
				+
			
 
				+	if (unlikely(type == DM_TYPE_NONE)) {
			
 
				+		DMWARN("no table type is set, can't allocate mempools");
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	t->mempools = dm_alloc_md_mempools(type);
			
 
				+	if (!t->mempools)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+void dm_table_free_md_mempools(struct dm_table *t)
			
 
				+{
			
 
				+	dm_free_md_mempools(t->mempools);
			
 
				+	t->mempools = NULL;
			
 
				+}
			
 
				+
			
 
				+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
			
 
				+{
			
 
				+	return t->mempools;
			
 
				+}
			
 
				+
			
 
				 static int setup_indexes(struct dm_table *t)
			
 
				 {
			
 
				 	int i;
			
@@ -792,8 +901,6 @@ int dm_table_complete(struct dm_table *t)
 
				 	int r = 0;
			
 
				 	unsigned int leaf_nodes;
			
 
				 
			
 
				-	check_for_valid_limits(&t->limits);
			
 
				-
			
 
				 	/* how many indexes will the btree have ? */
			
 
				 	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
			
 
				 	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
			
@@ -868,6 +975,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 
				 	return &t->targets[(KEYS_PER_NODE * n) + k];
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Establish the new table's queue_limits and validate them.
			
 
				+ */
			
 
				+int dm_calculate_queue_limits(struct dm_table *table,
			
 
				+			      struct queue_limits *limits)
			
 
				+{
			
 
				+	struct dm_target *uninitialized_var(ti);
			
 
				+	struct queue_limits ti_limits;
			
 
				+	unsigned i = 0;
			
 
				+
			
 
				+	blk_set_default_limits(limits);
			
 
				+
			
 
				+	while (i < dm_table_get_num_targets(table)) {
			
 
				+		blk_set_default_limits(&ti_limits);
			
 
				+
			
 
				+		ti = dm_table_get_target(table, i++);
			
 
				+
			
 
				+		if (!ti->type->iterate_devices)
			
 
				+			goto combine_limits;
			
 
				+
			
 
				+		/*
			
 
				+		 * Combine queue limits of all the devices this target uses.
			
 
				+		 */
			
 
				+		ti->type->iterate_devices(ti, dm_set_device_limits,
			
 
				+					  &ti_limits);
			
 
				+
			
 
				+		/*
			
 
				+		 * Check each device area is consistent with the target's
			
 
				+		 * overall queue limits.
			
 
				+		 */
			
 
				+		if (!ti->type->iterate_devices(ti, device_area_is_valid,
			
 
				+					       &ti_limits))
			
 
				+			return -EINVAL;
			
 
				+
			
 
				+combine_limits:
			
 
				+		/*
			
 
				+		 * Merge this target's queue limits into the overall limits
			
 
				+		 * for the table.
			
 
				+		 */
			
 
				+		if (blk_stack_limits(limits, &ti_limits, 0) < 0)
			
 
				+			DMWARN("%s: target device "
			
 
				+			       "(start sect %llu len %llu) "
			
 
				+			       "is misaligned",
			
 
				+			       dm_device_name(table->md),
			
 
				+			       (unsigned long long) ti->begin,
			
 
				+			       (unsigned long long) ti->len);
			
 
				+	}
			
 
				+
			
 
				+	return validate_hardware_logical_block_alignment(table, limits);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Set the integrity profile for this device if all devices used have
			
 
				  * matching profiles.
			
@@ -907,27 +1065,42 @@ static void dm_table_set_integrity(struct dm_table *t)
 
				 	return;
			
 
				 }
			
 
				 
			
 
				-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
			
 
				+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
			
 
				+			       struct queue_limits *limits)
			
 
				 {
			
 
				 	/*
			
 
				-	 * Make sure we obey the optimistic sub devices
			
 
				-	 * restrictions.
			
 
				+	 * Each target device in the table has a data area that should normally
			
 
				+	 * be aligned such that the DM device's alignment_offset is 0.
			
 
				+	 * FIXME: Propagate alignment_offsets up the stack and warn of
			
 
				+	 *	  sub-optimal or inconsistent settings.
			
 
				+	 */
			
 
				+	limits->alignment_offset = 0;
			
 
				+	limits->misaligned = 0;
			
 
				+
			
 
				+	/*
			
 
				+	 * Copy table's limits to the DM device's request_queue
			
 
				 	 */
			
 
				-	blk_queue_max_sectors(q, t->limits.max_sectors);
			
 
				-	blk_queue_max_phys_segments(q, t->limits.max_phys_segments);
			
 
				-	blk_queue_max_hw_segments(q, t->limits.max_hw_segments);
			
 
				-	blk_queue_logical_block_size(q, t->limits.logical_block_size);
			
 
				-	blk_queue_max_segment_size(q, t->limits.max_segment_size);
			
 
				-	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
			
 
				-	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
			
 
				-	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
			
 
				-
			
 
				-	if (t->limits.no_cluster)
			
 
				+	q->limits = *limits;
			
 
				+
			
 
				+	if (limits->no_cluster)
			
 
				 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
			
 
				 	else
			
 
				 		queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
			
 
				 
			
 
				 	dm_table_set_integrity(t);
			
 
				+
			
 
				+	/*
			
 
				+	 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
			
 
				+	 * visible to other CPUs because, once the flag is set, incoming bios
			
 
				+	 * are processed by request-based dm, which refers to the queue
			
 
				+	 * settings.
			
 
				+	 * Until the flag is set, bios are passed to bio-based dm and queued to
			
 
				+	 * md->deferred where queue settings are not needed yet.
			
 
				+	 * Those bios are passed to request-based dm at the resume time.
			
 
				+	 */
			
 
				+	smp_mb();
			
 
				+	if (dm_table_request_based(t))
			
 
				+		queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
			
 
				 }
			
 
				 
			
 
				 unsigned int dm_table_get_num_targets(struct dm_table *t)
			
@@ -1023,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				+int dm_table_any_busy_target(struct dm_table *t)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	struct dm_target *ti;
			
 
				+
			
 
				+	for (i = 0; i < t->num_targets; i++) {
			
 
				+		ti = t->targets + i;
			
 
				+		if (ti->type->busy && ti->type->busy(ti))
			
 
				+			return 1;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 void dm_table_unplug_all(struct dm_table *t)
			
 
				 {
			
 
				 	struct dm_dev_internal *dd;
			
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,13 @@
 
				 
			
 
				 #define DM_MSG_PREFIX "core"
			
 
				 
			
 
				+/*
			
 
				+ * Cookies are numeric values sent with CHANGE and REMOVE
			
 
				+ * uevents while resuming, removing or renaming the device.
			
 
				+ */
			
 
				+#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
			
 
				+#define DM_COOKIE_LENGTH 24
			
 
				+
			
 
				 static const char *_name = DM_NAME;
			
 
				 
			
 
				 static unsigned int major = 0;
			
@@ -71,7 +78,7 @@ struct dm_rq_target_io {
 
				  */
			
 
				 struct dm_rq_clone_bio_info {
			
 
				 	struct bio *orig;
			
 
				-	struct request *rq;
			
 
				+	struct dm_rq_target_io *tio;
			
 
				 };
			
 
				 
			
 
				 union map_info *dm_get_mapinfo(struct bio *bio)
			
@@ -81,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 
				 	return NULL;
			
 
				 }
			
 
				 
			
 
				+union map_info *dm_get_rq_mapinfo(struct request *rq)
			
 
				+{
			
 
				+	if (rq && rq->end_io_data)
			
 
				+		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
			
 
				+	return NULL;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
			
 
				+
			
 
				 #define MINOR_ALLOCED ((void *)-1)
			
 
				 
			
 
				 /*
			
@@ -157,13 +172,31 @@ struct mapped_device {
 
				 	 * freeze/thaw support require holding onto a super block
			
 
				 	 */
			
 
				 	struct super_block *frozen_sb;
			
 
				-	struct block_device *suspended_bdev;
			
 
				+	struct block_device *bdev;
			
 
				 
			
 
				 	/* forced geometry settings */
			
 
				 	struct hd_geometry geometry;
			
 
				 
			
 
				+	/* marker of flush suspend for request-based dm */
			
 
				+	struct request suspend_rq;
			
 
				+
			
 
				+	/* For saving the address of __make_request for request based dm */
			
 
				+	make_request_fn *saved_make_request_fn;
			
 
				+
			
 
				 	/* sysfs handle */
			
 
				 	struct kobject kobj;
			
 
				+
			
 
				+	/* zero-length barrier that will be cloned and submitted to targets */
			
 
				+	struct bio barrier_bio;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * For mempools pre-allocation at the table loading time.
			
 
				+ */
			
 
				+struct dm_md_mempools {
			
 
				+	mempool_t *io_pool;
			
 
				+	mempool_t *tio_pool;
			
 
				+	struct bio_set *bs;
			
 
				 };
			
 
				 
			
 
				 #define MIN_IOS 256
			
@@ -391,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
 
				 	mempool_free(io, md->io_pool);
			
 
				 }
			
 
				 
			
 
				-static struct dm_target_io *alloc_tio(struct mapped_device *md)
			
 
				+static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
			
 
				 {
			
 
				-	return mempool_alloc(md->tio_pool, GFP_NOIO);
			
 
				+	mempool_free(tio, md->tio_pool);
			
 
				 }
			
 
				 
			
 
				-static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
			
 
				+static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
			
 
				 {
			
 
				-	mempool_free(tio, md->tio_pool);
			
 
				+	return mempool_alloc(md->tio_pool, GFP_ATOMIC);
			
 
				+}
			
 
				+
			
 
				+static void free_rq_tio(struct dm_rq_target_io *tio)
			
 
				+{
			
 
				+	mempool_free(tio, tio->md->tio_pool);
			
 
				+}
			
 
				+
			
 
				+static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
			
 
				+{
			
 
				+	return mempool_alloc(md->io_pool, GFP_ATOMIC);
			
 
				+}
			
 
				+
			
 
				+static void free_bio_info(struct dm_rq_clone_bio_info *info)
			
 
				+{
			
 
				+	mempool_free(info, info->tio->md->io_pool);
			
 
				 }
			
 
				 
			
 
				 static void start_io_acct(struct dm_io *io)
			
@@ -464,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
 
				 struct dm_table *dm_get_table(struct mapped_device *md)
			
 
				 {
			
 
				 	struct dm_table *t;
			
 
				+	unsigned long flags;
			
 
				 
			
 
				-	read_lock(&md->map_lock);
			
 
				+	read_lock_irqsave(&md->map_lock, flags);
			
 
				 	t = md->map;
			
 
				 	if (t)
			
 
				 		dm_table_get(t);
			
 
				-	read_unlock(&md->map_lock);
			
 
				+	read_unlock_irqrestore(&md->map_lock, flags);
			
 
				 
			
 
				 	return t;
			
 
				 }
			
@@ -536,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error)
 
				 			 * Target requested pushing back the I/O.
			
 
				 			 */
			
 
				 			spin_lock_irqsave(&md->deferred_lock, flags);
			
 
				-			if (__noflush_suspending(md))
			
 
				-				bio_list_add_head(&md->deferred, io->bio);
			
 
				-			else
			
 
				+			if (__noflush_suspending(md)) {
			
 
				+				if (!bio_barrier(io->bio))
			
 
				+					bio_list_add_head(&md->deferred,
			
 
				+							  io->bio);
			
 
				+			} else
			
 
				 				/* noflush suspend was interrupted. */
			
 
				 				io->error = -EIO;
			
 
				 			spin_unlock_irqrestore(&md->deferred_lock, flags);
			
@@ -553,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error)
 
				 			 * a per-device variable for error reporting.
			
 
				 			 * Note that you can't touch the bio after end_io_acct
			
 
				 			 */
			
 
				-			md->barrier_error = io_error;
			
 
				+			if (!md->barrier_error && io_error != -EOPNOTSUPP)
			
 
				+				md->barrier_error = io_error;
			
 
				 			end_io_acct(io);
			
 
				 		} else {
			
 
				 			end_io_acct(io);
			
@@ -607,6 +659,262 @@ static void clone_endio(struct bio *bio, int error)
 
				 	dec_pending(io, error);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Partial completion handling for request-based dm
			
 
				+ */
			
 
				+static void end_clone_bio(struct bio *clone, int error)
			
 
				+{
			
 
				+	struct dm_rq_clone_bio_info *info = clone->bi_private;
			
 
				+	struct dm_rq_target_io *tio = info->tio;
			
 
				+	struct bio *bio = info->orig;
			
 
				+	unsigned int nr_bytes = info->orig->bi_size;
			
 
				+
			
 
				+	bio_put(clone);
			
 
				+
			
 
				+	if (tio->error)
			
 
				+		/*
			
 
				+		 * An error has already been detected on the request.
			
 
				+		 * Once an error has occurred, just let clone->end_io() handle
			
 
				+		 * the remainder.
			
 
				+		 */
			
 
				+		return;
			
 
				+	else if (error) {
			
 
				+		/*
			
 
				+		 * Don't notify the upper layer of the error yet.
			
 
				+		 * The error handling decision is made by the target driver,
			
 
				+		 * when the request is completed.
			
 
				+		 */
			
 
				+		tio->error = error;
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * I/O for the bio successfully completed.
			
 
				+	 * Notify the upper layer of the data completion.
			
 
				+	 */
			
 
				+
			
 
				+	/*
			
 
				+	 * bios are processed from the head of the list.
			
 
				+	 * So the completing bio should always be rq->bio.
			
 
				+	 * If it's not, something is wrong.
			
 
				+	 */
			
 
				+	if (tio->orig->bio != bio)
			
 
				+		DMERR("bio completion is going in the middle of the request");
			
 
				+
			
 
				+	/*
			
 
				+	 * Update the original request.
			
 
				+	 * Do not use blk_end_request() here, because it may complete
			
 
				+	 * the original request before the clone, and break the ordering.
			
 
				+	 */
			
 
				+	blk_update_request(tio->orig, 0, nr_bytes);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Don't touch any member of the md after calling this function because
			
 
				+ * the md may be freed in dm_put() at the end of this function.
			
 
				+ * Or do dm_get() before calling this function and dm_put() later.
			
 
				+ */
			
 
				+static void rq_completed(struct mapped_device *md, int run_queue)
			
 
				+{
			
 
				+	int wakeup_waiters = 0;
			
 
				+	struct request_queue *q = md->queue;
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(q->queue_lock, flags);
			
 
				+	if (!queue_in_flight(q))
			
 
				+		wakeup_waiters = 1;
			
 
				+	spin_unlock_irqrestore(q->queue_lock, flags);
			
 
				+
			
 
				+	/* nudge anyone waiting on suspend queue */
			
 
				+	if (wakeup_waiters)
			
 
				+		wake_up(&md->wait);
			
 
				+
			
 
				+	if (run_queue)
			
 
				+		blk_run_queue(q);
			
 
				+
			
 
				+	/*
			
 
				+	 * dm_put() must be at the end of this function. See the comment above
			
 
				+	 */
			
 
				+	dm_put(md);
			
 
				+}
			
 
				+
			
 
				+static void dm_unprep_request(struct request *rq)
			
 
				+{
			
 
				+	struct request *clone = rq->special;
			
 
				+	struct dm_rq_target_io *tio = clone->end_io_data;
			
 
				+
			
 
				+	rq->special = NULL;
			
 
				+	rq->cmd_flags &= ~REQ_DONTPREP;
			
 
				+
			
 
				+	blk_rq_unprep_clone(clone);
			
 
				+	free_rq_tio(tio);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Requeue the original request of a clone.
			
 
				+ */
			
 
				+void dm_requeue_unmapped_request(struct request *clone)
			
 
				+{
			
 
				+	struct dm_rq_target_io *tio = clone->end_io_data;
			
 
				+	struct mapped_device *md = tio->md;
			
 
				+	struct request *rq = tio->orig;
			
 
				+	struct request_queue *q = rq->q;
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	dm_unprep_request(rq);
			
 
				+
			
 
				+	spin_lock_irqsave(q->queue_lock, flags);
			
 
				+	if (elv_queue_empty(q))
			
 
				+		blk_plug_device(q);
			
 
				+	blk_requeue_request(q, rq);
			
 
				+	spin_unlock_irqrestore(q->queue_lock, flags);
			
 
				+
			
 
				+	rq_completed(md, 0);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
			
 
				+
			
 
				+static void __stop_queue(struct request_queue *q)
			
 
				+{
			
 
				+	blk_stop_queue(q);
			
 
				+}
			
 
				+
			
 
				+static void stop_queue(struct request_queue *q)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(q->queue_lock, flags);
			
 
				+	__stop_queue(q);
			
 
				+	spin_unlock_irqrestore(q->queue_lock, flags);
			
 
				+}
			
 
				+
			
 
				+static void __start_queue(struct request_queue *q)
			
 
				+{
			
 
				+	if (blk_queue_stopped(q))
			
 
				+		blk_start_queue(q);
			
 
				+}
			
 
				+
			
 
				+static void start_queue(struct request_queue *q)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(q->queue_lock, flags);
			
 
				+	__start_queue(q);
			
 
				+	spin_unlock_irqrestore(q->queue_lock, flags);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Complete the clone and the original request.
			
 
				+ * Must be called without queue lock.
			
 
				+ */
			
 
				+static void dm_end_request(struct request *clone, int error)
			
 
				+{
			
 
				+	struct dm_rq_target_io *tio = clone->end_io_data;
			
 
				+	struct mapped_device *md = tio->md;
			
 
				+	struct request *rq = tio->orig;
			
 
				+
			
 
				+	if (blk_pc_request(rq)) {
			
 
				+		rq->errors = clone->errors;
			
 
				+		rq->resid_len = clone->resid_len;
			
 
				+
			
 
				+		if (rq->sense)
			
 
				+			/*
			
 
				+			 * We are using the sense buffer of the original
			
 
				+			 * request.
			
 
				+			 * So setting the length of the sense data is enough.
			
 
				+			 */
			
 
				+			rq->sense_len = clone->sense_len;
			
 
				+	}
			
 
				+
			
 
				+	BUG_ON(clone->bio);
			
 
				+	free_rq_tio(tio);
			
 
				+
			
 
				+	blk_end_request_all(rq, error);
			
 
				+
			
 
				+	rq_completed(md, 1);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Request completion handler for request-based dm
			
 
				+ */
			
 
				+static void dm_softirq_done(struct request *rq)
			
 
				+{
			
 
				+	struct request *clone = rq->completion_data;
			
 
				+	struct dm_rq_target_io *tio = clone->end_io_data;
			
 
				+	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
			
 
				+	int error = tio->error;
			
 
				+
			
 
				+	if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
			
 
				+		error = rq_end_io(tio->ti, clone, error, &tio->info);
			
 
				+
			
 
				+	if (error <= 0)
			
 
				+		/* The target wants to complete the I/O */
			
 
				+		dm_end_request(clone, error);
			
 
				+	else if (error == DM_ENDIO_INCOMPLETE)
			
 
				+		/* The target will handle the I/O */
			
 
				+		return;
			
 
				+	else if (error == DM_ENDIO_REQUEUE)
			
 
				+		/* The target wants to requeue the I/O */
			
 
				+		dm_requeue_unmapped_request(clone);
			
 
				+	else {
			
 
				+		DMWARN("unimplemented target endio return value: %d", error);
			
 
				+		BUG();
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Complete the clone and the original request with the error status
			
 
				+ * through softirq context.
			
 
				+ */
			
 
				+static void dm_complete_request(struct request *clone, int error)
			
 
				+{
			
 
				+	struct dm_rq_target_io *tio = clone->end_io_data;
			
 
				+	struct request *rq = tio->orig;
			
 
				+
			
 
				+	tio->error = error;
			
 
				+	rq->completion_data = clone;
			
 
				+	blk_complete_request(rq);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Complete the not-mapped clone and the original request with the error status
			
 
				+ * through softirq context.
			
 
				+ * Target's rq_end_io() function isn't called.
			
 
				+ * This may be used when the target's map_rq() function fails.
			
 
				+ */
			
 
				+void dm_kill_unmapped_request(struct request *clone, int error)
			
 
				+{
			
 
				+	struct dm_rq_target_io *tio = clone->end_io_data;
			
 
				+	struct request *rq = tio->orig;
			
 
				+
			
 
				+	rq->cmd_flags |= REQ_FAILED;
			
 
				+	dm_complete_request(clone, error);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
			
 
				+
			
 
				+/*
			
 
				+ * Called with the queue lock held
			
 
				+ */
			
 
				+static void end_clone_request(struct request *clone, int error)
			
 
				+{
			
 
				+	/*
			
 
				+	 * This just cleans up the information of the queue in which
			
 
				+	 * the clone was dispatched.
			
 
				+	 * The clone is *NOT* actually freed here because it is allocated from
			
 
				+	 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
			
 
				+	 */
			
 
				+	__blk_put_request(clone->q, clone);
			
 
				+
			
 
				+	/*
			
 
				+	 * Actual request completion is done in a softirq context which doesn't
			
 
				+	 * hold the queue lock.  Otherwise, deadlock could occur because:
			
 
				+	 *     - another request may be submitted by the upper level driver
			
 
				+	 *       of the stacking during the completion
			
 
				+	 *     - the submission which requires queue lock may be done
			
 
				+	 *       against this queue
			
 
				+	 */
			
 
				+	dm_complete_request(clone, error);
			
 
				+}
			
 
				+
			
 
				 static sector_t max_io_len(struct mapped_device *md,
			
 
				 			   sector_t sector, struct dm_target *ti)
			
 
				 {
			
@@ -634,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 
				 	sector_t sector;
			
 
				 	struct mapped_device *md;
			
 
				 
			
 
				-	/*
			
 
				-	 * Sanity checks.
			
 
				-	 */
			
 
				-	BUG_ON(!clone->bi_size);
			
 
				-
			
 
				 	clone->bi_end_io = clone_endio;
			
 
				 	clone->bi_private = tio;
			
 
				 
			
@@ -752,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 
				 	return clone;
			
 
				 }
			
 
				 
			
 
				+static struct dm_target_io *alloc_tio(struct clone_info *ci,
			
 
				+				      struct dm_target *ti)
			
 
				+{
			
 
				+	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
			
 
				+
			
 
				+	tio->io = ci->io;
			
 
				+	tio->ti = ti;
			
 
				+	memset(&tio->info, 0, sizeof(tio->info));
			
 
				+
			
 
				+	return tio;
			
 
				+}
			
 
				+
			
 
				+static void __flush_target(struct clone_info *ci, struct dm_target *ti,
			
 
				+			  unsigned flush_nr)
			
 
				+{
			
 
				+	struct dm_target_io *tio = alloc_tio(ci, ti);
			
 
				+	struct bio *clone;
			
 
				+
			
 
				+	tio->info.flush_request = flush_nr;
			
 
				+
			
 
				+	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
			
 
				+	__bio_clone(clone, ci->bio);
			
 
				+	clone->bi_destructor = dm_bio_destructor;
			
 
				+
			
 
				+	__map_bio(ti, clone, tio);
			
 
				+}
			
 
				+
			
 
				+static int __clone_and_map_empty_barrier(struct clone_info *ci)
			
 
				+{
			
 
				+	unsigned target_nr = 0, flush_nr;
			
 
				+	struct dm_target *ti;
			
 
				+
			
 
				+	while ((ti = dm_table_get_target(ci->map, target_nr++)))
			
 
				+		for (flush_nr = 0; flush_nr < ti->num_flush_requests;
			
 
				+		     flush_nr++)
			
 
				+			__flush_target(ci, ti, flush_nr);
			
 
				+
			
 
				+	ci->sector_count = 0;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 static int __clone_and_map(struct clone_info *ci)
			
 
				 {
			
 
				 	struct bio *clone, *bio = ci->bio;
			
@@ -759,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci)
 
				 	sector_t len = 0, max;
			
 
				 	struct dm_target_io *tio;
			
 
				 
			
 
				+	if (unlikely(bio_empty_barrier(bio)))
			
 
				+		return __clone_and_map_empty_barrier(ci);
			
 
				+
			
 
				 	ti = dm_table_find_target(ci->map, ci->sector);
			
 
				 	if (!dm_target_is_valid(ti))
			
 
				 		return -EIO;
			
@@ -768,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci)
 
				 	/*
			
 
				 	 * Allocate a target io object.
			
 
				 	 */
			
 
				-	tio = alloc_tio(ci->md);
			
 
				-	tio->io = ci->io;
			
 
				-	tio->ti = ti;
			
 
				-	memset(&tio->info, 0, sizeof(tio->info));
			
 
				+	tio = alloc_tio(ci, ti);
			
 
				 
			
 
				 	if (ci->sector_count <= max) {
			
 
				 		/*
			
@@ -827,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci)
 
				 
			
 
				 				max = max_io_len(ci->md, ci->sector, ti);
			
 
				 
			
 
				-				tio = alloc_tio(ci->md);
			
 
				-				tio->io = ci->io;
			
 
				-				tio->ti = ti;
			
 
				-				memset(&tio->info, 0, sizeof(tio->info));
			
 
				+				tio = alloc_tio(ci, ti);
			
 
				 			}
			
 
				 
			
 
				 			len = min(remaining, max);
			
@@ -865,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
				 		if (!bio_barrier(bio))
			
 
				 			bio_io_error(bio);
			
 
				 		else
			
 
				-			md->barrier_error = -EIO;
			
 
				+			if (!md->barrier_error)
			
 
				+				md->barrier_error = -EIO;
			
 
				 		return;
			
 
				 	}
			
 
				 
			
@@ -878,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
				 	ci.io->md = md;
			
 
				 	ci.sector = bio->bi_sector;
			
 
				 	ci.sector_count = bio_sectors(bio);
			
 
				+	if (unlikely(bio_empty_barrier(bio)))
			
 
				+		ci.sector_count = 1;
			
 
				 	ci.idx = bio->bi_idx;
			
 
				 
			
 
				 	start_io_acct(ci.io);
			
@@ -925,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q,
 
				 	 */
			
 
				 	if (max_size && ti->type->merge)
			
 
				 		max_size = ti->type->merge(ti, bvm, biovec, max_size);
			
 
				+	/*
			
 
				+	 * If the target doesn't support merge method and some of the devices
			
 
				+	 * provided their merge_bvec method (we know this by looking at
			
 
				+	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
			
 
				+	 * entries.  So always set max_size to 0, and the code below allows
			
 
				+	 * just one page.
			
 
				+	 */
			
 
				+	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
			
 
				+
			
 
				+		max_size = 0;
			
 
				 
			
 
				 out_table:
			
 
				 	dm_table_put(map);
			
@@ -943,7 +1298,7 @@ static int dm_merge_bvec(struct request_queue *q,
 
				  * The request function that just remaps the bio built up by
			
 
				  * dm_merge_bvec.
			
 
				  */
			
 
				-static int dm_request(struct request_queue *q, struct bio *bio)
			
 
				+static int _dm_request(struct request_queue *q, struct bio *bio)
			
 
				 {
			
 
				 	int rw = bio_data_dir(bio);
			
 
				 	struct mapped_device *md = q->queuedata;
			
@@ -980,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static int dm_make_request(struct request_queue *q, struct bio *bio)
			
 
				+{
			
 
				+	struct mapped_device *md = q->queuedata;
			
 
				+
			
 
				+	if (unlikely(bio_barrier(bio))) {
			
 
				+		bio_endio(bio, -EOPNOTSUPP);
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	return md->saved_make_request_fn(q, bio); /* call __make_request() */
			
 
				+}
			
 
				+
			
 
				+static int dm_request_based(struct mapped_device *md)
			
 
				+{
			
 
				+	return blk_queue_stackable(md->queue);
			
 
				+}
			
 
				+
			
 
				+static int dm_request(struct request_queue *q, struct bio *bio)
			
 
				+{
			
 
				+	struct mapped_device *md = q->queuedata;
			
 
				+
			
 
				+	if (dm_request_based(md))
			
 
				+		return dm_make_request(q, bio);
			
 
				+
			
 
				+	return _dm_request(q, bio);
			
 
				+}
			
 
				+
			
 
				+void dm_dispatch_request(struct request *rq)
			
 
				+{
			
 
				+	int r;
			
 
				+
			
 
				+	if (blk_queue_io_stat(rq->q))
			
 
				+		rq->cmd_flags |= REQ_IO_STAT;
			
 
				+
			
 
				+	rq->start_time = jiffies;
			
 
				+	r = blk_insert_cloned_request(rq->q, rq);
			
 
				+	if (r)
			
 
				+		dm_complete_request(rq, r);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_dispatch_request);
			
 
				+
			
 
				+static void dm_rq_bio_destructor(struct bio *bio)
			
 
				+{
			
 
				+	struct dm_rq_clone_bio_info *info = bio->bi_private;
			
 
				+	struct mapped_device *md = info->tio->md;
			
 
				+
			
 
				+	free_bio_info(info);
			
 
				+	bio_free(bio, md->bs);
			
 
				+}
			
 
				+
			
 
				+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
			
 
				+				 void *data)
			
 
				+{
			
 
				+	struct dm_rq_target_io *tio = data;
			
 
				+	struct mapped_device *md = tio->md;
			
 
				+	struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
			
 
				+
			
 
				+	if (!info)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	info->orig = bio_orig;
			
 
				+	info->tio = tio;
			
 
				+	bio->bi_end_io = end_clone_bio;
			
 
				+	bio->bi_private = info;
			
 
				+	bio->bi_destructor = dm_rq_bio_destructor;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int setup_clone(struct request *clone, struct request *rq,
			
 
				+		       struct dm_rq_target_io *tio)
			
 
				+{
			
 
				+	int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
			
 
				+				  dm_rq_bio_constructor, tio);
			
 
				+
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	clone->cmd = rq->cmd;
			
 
				+	clone->cmd_len = rq->cmd_len;
			
 
				+	clone->sense = rq->sense;
			
 
				+	clone->buffer = rq->buffer;
			
 
				+	clone->end_io = end_clone_request;
			
 
				+	clone->end_io_data = tio;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int dm_rq_flush_suspending(struct mapped_device *md)
			
 
				+{
			
 
				+	return !md->suspend_rq.special;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Called with the queue lock held.
			
 
				+ */
			
 
				+static int dm_prep_fn(struct request_queue *q, struct request *rq)
			
 
				+{
			
 
				+	struct mapped_device *md = q->queuedata;
			
 
				+	struct dm_rq_target_io *tio;
			
 
				+	struct request *clone;
			
 
				+
			
 
				+	if (unlikely(rq == &md->suspend_rq)) {
			
 
				+		if (dm_rq_flush_suspending(md))
			
 
				+			return BLKPREP_OK;
			
 
				+		else
			
 
				+			/* The flush suspend was interrupted */
			
 
				+			return BLKPREP_KILL;
			
 
				+	}
			
 
				+
			
 
				+	if (unlikely(rq->special)) {
			
 
				+		DMWARN("Already has something in rq->special.");
			
 
				+		return BLKPREP_KILL;
			
 
				+	}
			
 
				+
			
 
				+	tio = alloc_rq_tio(md); /* Only one for each original request */
			
 
				+	if (!tio)
			
 
				+		/* -ENOMEM */
			
 
				+		return BLKPREP_DEFER;
			
 
				+
			
 
				+	tio->md = md;
			
 
				+	tio->ti = NULL;
			
 
				+	tio->orig = rq;
			
 
				+	tio->error = 0;
			
 
				+	memset(&tio->info, 0, sizeof(tio->info));
			
 
				+
			
 
				+	clone = &tio->clone;
			
 
				+	if (setup_clone(clone, rq, tio)) {
			
 
				+		/* -ENOMEM */
			
 
				+		free_rq_tio(tio);
			
 
				+		return BLKPREP_DEFER;
			
 
				+	}
			
 
				+
			
 
				+	rq->special = clone;
			
 
				+	rq->cmd_flags |= REQ_DONTPREP;
			
 
				+
			
 
				+	return BLKPREP_OK;
			
 
				+}
			
 
				+
			
 
				+static void map_request(struct dm_target *ti, struct request *rq,
			
 
				+			struct mapped_device *md)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct request *clone = rq->special;
			
 
				+	struct dm_rq_target_io *tio = clone->end_io_data;
			
 
				+
			
 
				+	/*
			
 
				+	 * Hold the md reference here for the in-flight I/O.
			
 
				+	 * We can't rely on the reference count by device opener,
			
 
				+	 * because the device may be closed during the request completion
			
 
				+	 * when all bios are completed.
			
 
				+	 * See the comment in rq_completed() too.
			
 
				+	 */
			
 
				+	dm_get(md);
			
 
				+
			
 
				+	tio->ti = ti;
			
 
				+	r = ti->type->map_rq(ti, clone, &tio->info);
			
 
				+	switch (r) {
			
 
				+	case DM_MAPIO_SUBMITTED:
			
 
				+		/* The target has taken the I/O to submit by itself later */
			
 
				+		break;
			
 
				+	case DM_MAPIO_REMAPPED:
			
 
				+		/* The target has remapped the I/O so dispatch it */
			
 
				+		dm_dispatch_request(clone);
			
 
				+		break;
			
 
				+	case DM_MAPIO_REQUEUE:
			
 
				+		/* The target wants to requeue the I/O */
			
 
				+		dm_requeue_unmapped_request(clone);
			
 
				+		break;
			
 
				+	default:
			
 
				+		if (r > 0) {
			
 
				+			DMWARN("unimplemented target map return value: %d", r);
			
 
				+			BUG();
			
 
				+		}
			
 
				+
			
 
				+		/* The target wants to complete the I/O */
			
 
				+		dm_kill_unmapped_request(clone, r);
			
 
				+		break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * q->request_fn for request-based dm.
			
 
				+ * Called with the queue lock held.
			
 
				+ */
			
 
				+static void dm_request_fn(struct request_queue *q)
			
 
				+{
			
 
				+	struct mapped_device *md = q->queuedata;
			
 
				+	struct dm_table *map = dm_get_table(md);
			
 
				+	struct dm_target *ti;
			
 
				+	struct request *rq;
			
 
				+
			
 
				+	/*
			
 
				+	 * For noflush suspend, check blk_queue_stopped() to immediately
			
 
				+	 * quit I/O dispatching.
			
 
				+	 */
			
 
				+	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
			
 
				+		rq = blk_peek_request(q);
			
 
				+		if (!rq)
			
 
				+			goto plug_and_out;
			
 
				+
			
 
				+		if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */
			
 
				+			if (queue_in_flight(q))
			
 
				+				/* Not quiet yet.  Wait more */
			
 
				+				goto plug_and_out;
			
 
				+
			
 
				+			/* This device should be quiet now */
			
 
				+			__stop_queue(q);
			
 
				+			blk_start_request(rq);
			
 
				+			__blk_end_request_all(rq, 0);
			
 
				+			wake_up(&md->wait);
			
 
				+			goto out;
			
 
				+		}
			
 
				+
			
 
				+		ti = dm_table_find_target(map, blk_rq_pos(rq));
			
 
				+		if (ti->type->busy && ti->type->busy(ti))
			
 
				+			goto plug_and_out;
			
 
				+
			
 
				+		blk_start_request(rq);
			
 
				+		spin_unlock(q->queue_lock);
			
 
				+		map_request(ti, rq, md);
			
 
				+		spin_lock_irq(q->queue_lock);
			
 
				+	}
			
 
				+
			
 
				+	goto out;
			
 
				+
			
 
				+plug_and_out:
			
 
				+	if (!elv_queue_empty(q))
			
 
				+		/* Some requests still remain, retry later */
			
 
				+		blk_plug_device(q);
			
 
				+
			
 
				+out:
			
 
				+	dm_table_put(map);
			
 
				+
			
 
				+	return;
			
 
				+}
			
 
				+
			
 
				+int dm_underlying_device_busy(struct request_queue *q)
			
 
				+{
			
 
				+	return blk_lld_busy(q);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
			
 
				+
			
 
				+static int dm_lld_busy(struct request_queue *q)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct mapped_device *md = q->queuedata;
			
 
				+	struct dm_table *map = dm_get_table(md);
			
 
				+
			
 
				+	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
			
 
				+		r = 1;
			
 
				+	else
			
 
				+		r = dm_table_any_busy_target(map);
			
 
				+
			
 
				+	dm_table_put(map);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				 static void dm_unplug_all(struct request_queue *q)
			
 
				 {
			
 
				 	struct mapped_device *md = q->queuedata;
			
 
				 	struct dm_table *map = dm_get_table(md);
			
 
				 
			
 
				 	if (map) {
			
 
				+		if (dm_request_based(md))
			
 
				+			generic_unplug_device(q);
			
 
				+
			
 
				 		dm_table_unplug_all(map);
			
 
				 		dm_table_put(map);
			
 
				 	}
			
@@ -1000,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 
				 	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
			
 
				 		map = dm_get_table(md);
			
 
				 		if (map) {
			
 
				-			r = dm_table_any_congested(map, bdi_bits);
			
 
				+			/*
			
 
				+			 * Request-based dm cares only about its own queue for
			
 
				+			 * the query about congestion status of request_queue
			
 
				+			 */
			
 
				+			if (dm_request_based(md))
			
 
				+				r = md->queue->backing_dev_info.state &
			
 
				+				    bdi_bits;
			
 
				+			else
			
 
				+				r = dm_table_any_congested(map, bdi_bits);
			
 
				+
			
 
				 			dm_table_put(map);
			
 
				 		}
			
 
				 	}
			
@@ -1123,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor)
 
				 	INIT_LIST_HEAD(&md->uevent_list);
			
 
				 	spin_lock_init(&md->uevent_lock);
			
 
				 
			
 
				-	md->queue = blk_alloc_queue(GFP_KERNEL);
			
 
				+	md->queue = blk_init_queue(dm_request_fn, NULL);
			
 
				 	if (!md->queue)
			
 
				 		goto bad_queue;
			
 
				 
			
 
				+	/*
			
 
				+	 * Request-based dm devices cannot be stacked on top of bio-based dm
			
 
				+	 * devices.  The type of this dm device has not been decided yet,
			
 
				+	 * although we initialized the queue using blk_init_queue().
			
 
				+	 * The type is decided at the first table loading time.
			
 
				+	 * To prevent problematic device stacking, clear the queue flag
			
 
				+	 * for request stacking support until then.
			
 
				+	 *
			
 
				+	 * This queue is new, so no concurrency on the queue_flags.
			
 
				+	 */
			
 
				+	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
			
 
				+	md->saved_make_request_fn = md->queue->make_request_fn;
			
 
				 	md->queue->queuedata = md;
			
 
				 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
			
 
				 	md->queue->backing_dev_info.congested_data = md;
			
 
				 	blk_queue_make_request(md->queue, dm_request);
			
 
				-	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
			
 
				 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
			
 
				 	md->queue->unplug_fn = dm_unplug_all;
			
 
				 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
			
 
				-
			
 
				-	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
			
 
				-	if (!md->io_pool)
			
 
				-		goto bad_io_pool;
			
 
				-
			
 
				-	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
			
 
				-	if (!md->tio_pool)
			
 
				-		goto bad_tio_pool;
			
 
				-
			
 
				-	md->bs = bioset_create(16, 0);
			
 
				-	if (!md->bs)
			
 
				-		goto bad_no_bioset;
			
 
				+	blk_queue_softirq_done(md->queue, dm_softirq_done);
			
 
				+	blk_queue_prep_rq(md->queue, dm_prep_fn);
			
 
				+	blk_queue_lld_busy(md->queue, dm_lld_busy);
			
 
				 
			
 
				 	md->disk = alloc_disk(1);
			
 
				 	if (!md->disk)
			
@@ -1170,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor)
 
				 	if (!md->wq)
			
 
				 		goto bad_thread;
			
 
				 
			
 
				+	md->bdev = bdget_disk(md->disk, 0);
			
 
				+	if (!md->bdev)
			
 
				+		goto bad_bdev;
			
 
				+
			
 
				 	/* Populate the mapping, nobody knows we exist yet */
			
 
				 	spin_lock(&_minor_lock);
			
 
				 	old_md = idr_replace(&_minor_idr, md, minor);
			
@@ -1179,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor)
 
				 
			
 
				 	return md;
			
 
				 
			
 
				+bad_bdev:
			
 
				+	destroy_workqueue(md->wq);
			
 
				 bad_thread:
			
 
				 	put_disk(md->disk);
			
 
				 bad_disk:
			
 
				-	bioset_free(md->bs);
			
 
				-bad_no_bioset:
			
 
				-	mempool_destroy(md->tio_pool);
			
 
				-bad_tio_pool:
			
 
				-	mempool_destroy(md->io_pool);
			
 
				-bad_io_pool:
			
 
				 	blk_cleanup_queue(md->queue);
			
 
				 bad_queue:
			
 
				 	free_minor(minor);
			
@@ -1204,14 +1832,15 @@ static void free_dev(struct mapped_device *md)
 
				 {
			
 
				 	int minor = MINOR(disk_devt(md->disk));
			
 
				 
			
 
				-	if (md->suspended_bdev) {
			
 
				-		unlock_fs(md);
			
 
				-		bdput(md->suspended_bdev);
			
 
				-	}
			
 
				+	unlock_fs(md);
			
 
				+	bdput(md->bdev);
			
 
				 	destroy_workqueue(md->wq);
			
 
				-	mempool_destroy(md->tio_pool);
			
 
				-	mempool_destroy(md->io_pool);
			
 
				-	bioset_free(md->bs);
			
 
				+	if (md->tio_pool)
			
 
				+		mempool_destroy(md->tio_pool);
			
 
				+	if (md->io_pool)
			
 
				+		mempool_destroy(md->io_pool);
			
 
				+	if (md->bs)
			
 
				+		bioset_free(md->bs);
			
 
				 	blk_integrity_unregister(md->disk);
			
 
				 	del_gendisk(md->disk);
			
 
				 	free_minor(minor);
			
@@ -1226,6 +1855,29 @@ static void free_dev(struct mapped_device *md)
 
				 	kfree(md);
			
 
				 }
			
 
				 
			
 
				+static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
			
 
				+{
			
 
				+	struct dm_md_mempools *p;
			
 
				+
			
 
				+	if (md->io_pool && md->tio_pool && md->bs)
			
 
				+		/* the md already has necessary mempools */
			
 
				+		goto out;
			
 
				+
			
 
				+	p = dm_table_get_md_mempools(t);
			
 
				+	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
			
 
				+
			
 
				+	md->io_pool = p->io_pool;
			
 
				+	p->io_pool = NULL;
			
 
				+	md->tio_pool = p->tio_pool;
			
 
				+	p->tio_pool = NULL;
			
 
				+	md->bs = p->bs;
			
 
				+	p->bs = NULL;
			
 
				+
			
 
				+out:
			
 
				+	/* mempool bind completed, so the table no longer needs any mempools */
			
 
				+	dm_table_free_md_mempools(t);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Bind a table to the device.
			
 
				  */
			
@@ -1249,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size)
 
				 {
			
 
				 	set_capacity(md->disk, size);
			
 
				 
			
 
				-	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
			
 
				-	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
			
 
				-	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
			
 
				+	mutex_lock(&md->bdev->bd_inode->i_mutex);
			
 
				+	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
			
 
				+	mutex_unlock(&md->bdev->bd_inode->i_mutex);
			
 
				 }
			
 
				 
			
 
				-static int __bind(struct mapped_device *md, struct dm_table *t)
			
 
				+static int __bind(struct mapped_device *md, struct dm_table *t,
			
 
				+		  struct queue_limits *limits)
			
 
				 {
			
 
				 	struct request_queue *q = md->queue;
			
 
				 	sector_t size;
			
 
				+	unsigned long flags;
			
 
				 
			
 
				 	size = dm_table_get_size(t);
			
 
				 
			
@@ -1267,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 
				 	if (size != get_capacity(md->disk))
			
 
				 		memset(&md->geometry, 0, sizeof(md->geometry));
			
 
				 
			
 
				-	if (md->suspended_bdev)
			
 
				-		__set_size(md, size);
			
 
				+	__set_size(md, size);
			
 
				 
			
 
				 	if (!size) {
			
 
				 		dm_table_destroy(t);
			
@@ -1277,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 
				 
			
 
				 	dm_table_event_callback(t, event_callback, md);
			
 
				 
			
 
				-	write_lock(&md->map_lock);
			
 
				+	/*
			
 
				+	 * The queue hasn't been stopped yet, if the old table type wasn't
			
 
				+	 * request-based during suspension.  So stop it to prevent
			
 
				+	 * I/O mapping before resume.
			
 
				+	 * This must be done before setting the queue restrictions,
			
 
				+	 * because request-based dm may be run just after the setting.
			
 
				+	 */
			
 
				+	if (dm_table_request_based(t) && !blk_queue_stopped(q))
			
 
				+		stop_queue(q);
			
 
				+
			
 
				+	__bind_mempools(md, t);
			
 
				+
			
 
				+	write_lock_irqsave(&md->map_lock, flags);
			
 
				 	md->map = t;
			
 
				-	dm_table_set_restrictions(t, q);
			
 
				-	write_unlock(&md->map_lock);
			
 
				+	dm_table_set_restrictions(t, q, limits);
			
 
				+	write_unlock_irqrestore(&md->map_lock, flags);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -1288,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 
				 static void __unbind(struct mapped_device *md)
			
 
				 {
			
 
				 	struct dm_table *map = md->map;
			
 
				+	unsigned long flags;
			
 
				 
			
 
				 	if (!map)
			
 
				 		return;
			
 
				 
			
 
				 	dm_table_event_callback(map, NULL, NULL);
			
 
				-	write_lock(&md->map_lock);
			
 
				+	write_lock_irqsave(&md->map_lock, flags);
			
 
				 	md->map = NULL;
			
 
				-	write_unlock(&md->map_lock);
			
 
				+	write_unlock_irqrestore(&md->map_lock, flags);
			
 
				 	dm_table_destroy(map);
			
 
				 }
			
 
				 
			
@@ -1399,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 
				 {
			
 
				 	int r = 0;
			
 
				 	DECLARE_WAITQUEUE(wait, current);
			
 
				+	struct request_queue *q = md->queue;
			
 
				+	unsigned long flags;
			
 
				 
			
 
				 	dm_unplug_all(md->queue);
			
 
				 
			
@@ -1408,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 
				 		set_current_state(interruptible);
			
 
				 
			
 
				 		smp_mb();
			
 
				-		if (!atomic_read(&md->pending))
			
 
				+		if (dm_request_based(md)) {
			
 
				+			spin_lock_irqsave(q->queue_lock, flags);
			
 
				+			if (!queue_in_flight(q) && blk_queue_stopped(q)) {
			
 
				+				spin_unlock_irqrestore(q->queue_lock, flags);
			
 
				+				break;
			
 
				+			}
			
 
				+			spin_unlock_irqrestore(q->queue_lock, flags);
			
 
				+		} else if (!atomic_read(&md->pending))
			
 
				 			break;
			
 
				 
			
 
				 		if (interruptible == TASK_INTERRUPTIBLE &&
			
@@ -1426,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				-static int dm_flush(struct mapped_device *md)
			
 
				+static void dm_flush(struct mapped_device *md)
			
 
				 {
			
 
				 	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
			
 
				-	return 0;
			
 
				+
			
 
				+	bio_init(&md->barrier_bio);
			
 
				+	md->barrier_bio.bi_bdev = md->bdev;
			
 
				+	md->barrier_bio.bi_rw = WRITE_BARRIER;
			
 
				+	__split_and_process_bio(md, &md->barrier_bio);
			
 
				+
			
 
				+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
			
 
				 }
			
 
				 
			
 
				 static void process_barrier(struct mapped_device *md, struct bio *bio)
			
 
				 {
			
 
				-	int error = dm_flush(md);
			
 
				-
			
 
				-	if (unlikely(error)) {
			
 
				-		bio_endio(bio, error);
			
 
				-		return;
			
 
				-	}
			
 
				-	if (bio_empty_barrier(bio)) {
			
 
				-		bio_endio(bio, 0);
			
 
				-		return;
			
 
				-	}
			
 
				-
			
 
				-	__split_and_process_bio(md, bio);
			
 
				+	md->barrier_error = 0;
			
 
				 
			
 
				-	error = dm_flush(md);
			
 
				+	dm_flush(md);
			
 
				 
			
 
				-	if (!error && md->barrier_error)
			
 
				-		error = md->barrier_error;
			
 
				+	if (!bio_empty_barrier(bio)) {
			
 
				+		__split_and_process_bio(md, bio);
			
 
				+		dm_flush(md);
			
 
				+	}
			
 
				 
			
 
				 	if (md->barrier_error != DM_ENDIO_REQUEUE)
			
 
				-		bio_endio(bio, error);
			
 
				+		bio_endio(bio, md->barrier_error);
			
 
				+	else {
			
 
				+		spin_lock_irq(&md->deferred_lock);
			
 
				+		bio_list_add_head(&md->deferred, bio);
			
 
				+		spin_unlock_irq(&md->deferred_lock);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1479,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work)
 
				 
			
 
				 		up_write(&md->io_lock);
			
 
				 
			
 
				-		if (bio_barrier(c))
			
 
				-			process_barrier(md, c);
			
 
				-		else
			
 
				-			__split_and_process_bio(md, c);
			
 
				+		if (dm_request_based(md))
			
 
				+			generic_make_request(c);
			
 
				+		else {
			
 
				+			if (bio_barrier(c))
			
 
				+				process_barrier(md, c);
			
 
				+			else
			
 
				+				__split_and_process_bio(md, c);
			
 
				+		}
			
 
				 
			
 
				 		down_write(&md->io_lock);
			
 
				 	}
			
@@ -1502,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md)
 
				  */
			
 
				 int dm_swap_table(struct mapped_device *md, struct dm_table *table)
			
 
				 {
			
 
				+	struct queue_limits limits;
			
 
				 	int r = -EINVAL;
			
 
				 
			
 
				 	mutex_lock(&md->suspend_lock);
			
@@ -1510,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 
				 	if (!dm_suspended(md))
			
 
				 		goto out;
			
 
				 
			
 
				-	/* without bdev, the device size cannot be changed */
			
 
				-	if (!md->suspended_bdev)
			
 
				-		if (get_capacity(md->disk) != dm_table_get_size(table))
			
 
				-			goto out;
			
 
				+	r = dm_calculate_queue_limits(table, &limits);
			
 
				+	if (r)
			
 
				+		goto out;
			
 
				+
			
 
				+	/* cannot change the device type, once a table is bound */
			
 
				+	if (md->map &&
			
 
				+	    (dm_table_get_type(md->map) != dm_table_get_type(table))) {
			
 
				+		DMWARN("can't change the device type after a table is bound");
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * It is enough that blk_queue_ordered() is called only once when
			
 
				+	 * the first bio-based table is bound.
			
 
				+	 *
			
 
				+	 * This setting should be moved to alloc_dev() when request-based dm
			
 
				+	 * supports barrier.
			
 
				+	 */
			
 
				+	if (!md->map && dm_table_bio_based(table))
			
 
				+		blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
			
 
				 
			
 
				 	__unbind(md);
			
 
				-	r = __bind(md, table);
			
 
				+	r = __bind(md, table, &limits);
			
 
				 
			
 
				 out:
			
 
				 	mutex_unlock(&md->suspend_lock);
			
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				+static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
			
 
				+{
			
 
				+	md->suspend_rq.special = (void *)0x1;
			
 
				+}
			
 
				+
			
 
				+static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
			
 
				+{
			
 
				+	struct request_queue *q = md->queue;
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(q->queue_lock, flags);
			
 
				+	if (!noflush)
			
 
				+		dm_rq_invalidate_suspend_marker(md);
			
 
				+	__start_queue(q);
			
 
				+	spin_unlock_irqrestore(q->queue_lock, flags);
			
 
				+}
			
 
				+
			
 
				+static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
			
 
				+{
			
 
				+	struct request *rq = &md->suspend_rq;
			
 
				+	struct request_queue *q = md->queue;
			
 
				+
			
 
				+	if (noflush)
			
 
				+		stop_queue(q);
			
 
				+	else {
			
 
				+		blk_rq_init(q, rq);
			
 
				+		blk_insert_request(q, rq, 0, NULL);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
			
 
				+{
			
 
				+	int r = 1;
			
 
				+	struct request *rq = &md->suspend_rq;
			
 
				+	struct request_queue *q = md->queue;
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	if (noflush)
			
 
				+		return r;
			
 
				+
			
 
				+	/* The marker must be protected by queue lock if it is in use */
			
 
				+	spin_lock_irqsave(q->queue_lock, flags);
			
 
				+	if (unlikely(rq->ref_count)) {
			
 
				+		/*
			
 
				+		 * This can happen, when the previous flush suspend was
			
 
				+		 * interrupted, the marker is still in the queue and
			
 
				+		 * this flush suspend has been invoked, because we don't
			
 
				+		 * remove the marker at the time of suspend interruption.
			
 
				+		 * We have only one marker per mapped_device, so we can't
			
 
				+		 * start another flush suspend while it is in use.
			
 
				+		 */
			
 
				+		BUG_ON(!rq->special); /* The marker should be invalidated */
			
 
				+		DMWARN("Invalidating the previous flush suspend is still in"
			
 
				+		       " progress.  Please retry later.");
			
 
				+		r = 0;
			
 
				+	}
			
 
				+	spin_unlock_irqrestore(q->queue_lock, flags);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Functions to lock and unlock any filesystem running on the
			
 
				  * device.
			
@@ -1533,7 +2292,7 @@ static int lock_fs(struct mapped_device *md)
 
				 
			
 
				 	WARN_ON(md->frozen_sb);
			
 
				 
			
 
				-	md->frozen_sb = freeze_bdev(md->suspended_bdev);
			
 
				+	md->frozen_sb = freeze_bdev(md->bdev);
			
 
				 	if (IS_ERR(md->frozen_sb)) {
			
 
				 		r = PTR_ERR(md->frozen_sb);
			
 
				 		md->frozen_sb = NULL;
			
@@ -1542,9 +2301,6 @@ static int lock_fs(struct mapped_device *md)
 
				 
			
 
				 	set_bit(DMF_FROZEN, &md->flags);
			
 
				 
			
 
				-	/* don't bdput right now, we don't want the bdev
			
 
				-	 * to go away while it is locked.
			
 
				-	 */
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -1553,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md)
 
				 	if (!test_bit(DMF_FROZEN, &md->flags))
			
 
				 		return;
			
 
				 
			
 
				-	thaw_bdev(md->suspended_bdev, md->frozen_sb);
			
 
				+	thaw_bdev(md->bdev, md->frozen_sb);
			
 
				 	md->frozen_sb = NULL;
			
 
				 	clear_bit(DMF_FROZEN, &md->flags);
			
 
				 }
			
@@ -1565,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md)
 
				  * dm_bind_table, dm_suspend must be called to flush any in
			
 
				  * flight bios and ensure that any further io gets deferred.
			
 
				  */
			
 
				+/*
			
 
				+ * Suspend mechanism in request-based dm.
			
 
				+ *
			
 
				+ * After the suspend starts, further incoming requests are kept in
			
 
				+ * the request_queue and deferred.
			
 
				+ * Remaining requests in the request_queue at the start of suspend are flushed
			
 
				+ * if it is flush suspend.
			
 
				+ * The suspend completes when the following conditions have been satisfied,
			
 
				+ * so wait for it:
			
 
				+ *    1. q->in_flight is 0 (which means no in_flight request)
			
 
				+ *    2. queue has been stopped (which means no request dispatching)
			
 
				+ *
			
 
				+ *
			
 
				+ * Noflush suspend
			
 
				+ * ---------------
			
 
				+ * Noflush suspend doesn't need to dispatch remaining requests.
			
 
				+ * So stop the queue immediately.  Then, wait for all in_flight requests
			
 
				+ * to be completed or requeued.
			
 
				+ *
			
 
				+ * To abort noflush suspend, start the queue.
			
 
				+ *
			
 
				+ *
			
 
				+ * Flush suspend
			
 
				+ * -------------
			
 
				+ * Flush suspend needs to dispatch remaining requests.  So stop the queue
			
 
				+ * after the remaining requests are completed. (Requeued requests must also be
			
 
				+ * re-dispatched and completed.  Until then, we can't stop the queue.)
			
 
				+ *
			
 
				+ * During flushing the remaining requests, further incoming requests are also
			
 
				+ * inserted to the same queue.  To distinguish which requests are to be
			
 
				+ * flushed, we insert a marker request to the queue at the time of starting
			
 
				+ * flush suspend, like a barrier.
			
 
				+ * The dispatching is blocked when the marker is found on the top of the queue.
			
 
				+ * And the queue is stopped when all in_flight requests are completed, since
			
 
				+ * that means the remaining requests are completely flushed.
			
 
				+ * Then, the marker is removed from the queue.
			
 
				+ *
			
 
				+ * To abort flush suspend, we also need to take care of the marker, not only
			
 
				+ * starting the queue.
			
 
				+ * We don't remove the marker forcibly from the queue since it's against
			
 
				+ * the block-layer manner.  Instead, we put an invalidated mark on the marker.
			
 
				+ * When the invalidated marker is found on the top of the queue, it is
			
 
				+ * immediately removed from the queue, so it doesn't block dispatching.
			
 
				+ * Because we have only one marker per mapped_device, we can't start another
			
 
				+ * flush suspend until the invalidated marker is removed from the queue.
			
 
				+ * So fail and return with -EBUSY in such a case.
			
 
				+ */
			
 
				 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
			
 
				 {
			
 
				 	struct dm_table *map = NULL;
			
@@ -1579,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				+	if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
			
 
				+		r = -EBUSY;
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				 	map = dm_get_table(md);
			
 
				 
			
 
				 	/*
			
@@ -1591,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
				 	/* This does not get reverted if there's an error later. */
			
 
				 	dm_table_presuspend_targets(map);
			
 
				 
			
 
				-	/* bdget() can stall if the pending I/Os are not flushed */
			
 
				-	if (!noflush) {
			
 
				-		md->suspended_bdev = bdget_disk(md->disk, 0);
			
 
				-		if (!md->suspended_bdev) {
			
 
				-			DMWARN("bdget failed in dm_suspend");
			
 
				-			r = -ENOMEM;
			
 
				+	/*
			
 
				+	 * Flush I/O to the device. noflush supersedes do_lockfs,
			
 
				+	 * because lock_fs() needs to flush I/Os.
			
 
				+	 */
			
 
				+	if (!noflush && do_lockfs) {
			
 
				+		r = lock_fs(md);
			
 
				+		if (r)
			
 
				 			goto out;
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-		 * Flush I/O to the device. noflush supersedes do_lockfs,
			
 
				-		 * because lock_fs() needs to flush I/Os.
			
 
				-		 */
			
 
				-		if (do_lockfs) {
			
 
				-			r = lock_fs(md);
			
 
				-			if (r)
			
 
				-				goto out;
			
 
				-		}
			
 
				 	}
			
 
				 
			
 
				 	/*
			
@@ -1634,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
				 
			
 
				 	flush_workqueue(md->wq);
			
 
				 
			
 
				+	if (dm_request_based(md))
			
 
				+		dm_rq_start_suspend(md, noflush);
			
 
				+
			
 
				 	/*
			
 
				 	 * At this point no more requests are entering target request routines.
			
 
				 	 * We call dm_wait_for_completion to wait for all existing requests
			
@@ -1650,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
				 	if (r < 0) {
			
 
				 		dm_queue_flush(md);
			
 
				 
			
 
				+		if (dm_request_based(md))
			
 
				+			dm_rq_abort_suspend(md, noflush);
			
 
				+
			
 
				 		unlock_fs(md);
			
 
				 		goto out; /* pushback list is already flushed, so skip flush */
			
 
				 	}
			
@@ -1665,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
				 	set_bit(DMF_SUSPENDED, &md->flags);
			
 
				 
			
 
				 out:
			
 
				-	if (r && md->suspended_bdev) {
			
 
				-		bdput(md->suspended_bdev);
			
 
				-		md->suspended_bdev = NULL;
			
 
				-	}
			
 
				-
			
 
				 	dm_table_put(map);
			
 
				 
			
 
				 out_unlock:
			
@@ -1696,21 +2495,20 @@ int dm_resume(struct mapped_device *md)
 
				 
			
 
				 	dm_queue_flush(md);
			
 
				 
			
 
				-	unlock_fs(md);
			
 
				+	/*
			
 
				+	 * Flushing deferred I/Os must be done after targets are resumed
			
 
				+	 * so that mapping of targets can work correctly.
			
 
				+	 * Request-based dm is queueing the deferred I/Os in its request_queue.
			
 
				+	 */
			
 
				+	if (dm_request_based(md))
			
 
				+		start_queue(md->queue);
			
 
				 
			
 
				-	if (md->suspended_bdev) {
			
 
				-		bdput(md->suspended_bdev);
			
 
				-		md->suspended_bdev = NULL;
			
 
				-	}
			
 
				+	unlock_fs(md);
			
 
				 
			
 
				 	clear_bit(DMF_SUSPENDED, &md->flags);
			
 
				 
			
 
				 	dm_table_unplug_all(map);
			
 
				-
			
 
				-	dm_kobject_uevent(md);
			
 
				-
			
 
				 	r = 0;
			
 
				-
			
 
				 out:
			
 
				 	dm_table_put(map);
			
 
				 	mutex_unlock(&md->suspend_lock);
			
@@ -1721,9 +2519,19 @@ int dm_resume(struct mapped_device *md)
 
				 /*-----------------------------------------------------------------
			
 
				  * Event notification.
			
 
				  *---------------------------------------------------------------*/
			
 
				-void dm_kobject_uevent(struct mapped_device *md)
			
 
				-{
			
 
				-	kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
			
 
				+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
			
 
				+		       unsigned cookie)
			
 
				+{
			
 
				+	char udev_cookie[DM_COOKIE_LENGTH];
			
 
				+	char *envp[] = { udev_cookie, NULL };
			
 
				+
			
 
				+	if (!cookie)
			
 
				+		kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
			
 
				+	else {
			
 
				+		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			
 
				+			 DM_COOKIE_ENV_VAR_NAME, cookie);
			
 
				+		kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 uint32_t dm_next_uevent_seq(struct mapped_device *md)
			
@@ -1777,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
 
				 	if (&md->kobj != kobj)
			
 
				 		return NULL;
			
 
				 
			
 
				+	if (test_bit(DMF_FREEING, &md->flags) ||
			
 
				+	    test_bit(DMF_DELETING, &md->flags))
			
 
				+		return NULL;
			
 
				+
			
 
				 	dm_get(md);
			
 
				 	return md;
			
 
				 }
			
@@ -1797,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti)
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
			
 
				 
			
 
				+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
			
 
				+{
			
 
				+	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
			
 
				+
			
 
				+	if (!pools)
			
 
				+		return NULL;
			
 
				+
			
 
				+	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			
 
				+			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			
 
				+			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
			
 
				+	if (!pools->io_pool)
			
 
				+		goto free_pools_and_out;
			
 
				+
			
 
				+	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			
 
				+			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			
 
				+			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
			
 
				+	if (!pools->tio_pool)
			
 
				+		goto free_io_pool_and_out;
			
 
				+
			
 
				+	pools->bs = (type == DM_TYPE_BIO_BASED) ?
			
 
				+		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
			
 
				+	if (!pools->bs)
			
 
				+		goto free_tio_pool_and_out;
			
 
				+
			
 
				+	return pools;
			
 
				+
			
 
				+free_tio_pool_and_out:
			
 
				+	mempool_destroy(pools->tio_pool);
			
 
				+
			
 
				+free_io_pool_and_out:
			
 
				+	mempool_destroy(pools->io_pool);
			
 
				+
			
 
				+free_pools_and_out:
			
 
				+	kfree(pools);
			
 
				+
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+void dm_free_md_mempools(struct dm_md_mempools *pools)
			
 
				+{
			
 
				+	if (!pools)
			
 
				+		return;
			
 
				+
			
 
				+	if (pools->io_pool)
			
 
				+		mempool_destroy(pools->io_pool);
			
 
				+
			
 
				+	if (pools->tio_pool)
			
 
				+		mempool_destroy(pools->tio_pool);
			
 
				+
			
 
				+	if (pools->bs)
			
 
				+		bioset_free(pools->bs);
			
 
				+
			
 
				+	kfree(pools);
			
 
				+}
			
 
				+
			
 
				 static struct block_device_operations dm_blk_dops = {
			
 
				 	.open = dm_blk_open,
			
 
				 	.release = dm_blk_close,
			
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -22,6 +22,13 @@
 
				 #define DM_SUSPEND_LOCKFS_FLAG		(1 << 0)
			
 
				 #define DM_SUSPEND_NOFLUSH_FLAG		(1 << 1)
			
 
				 
			
 
				+/*
			
 
				+ * Type of table and mapped_device's mempool
			
 
				+ */
			
 
				+#define DM_TYPE_NONE		0
			
 
				+#define DM_TYPE_BIO_BASED	1
			
 
				+#define DM_TYPE_REQUEST_BASED	2
			
 
				+
			
 
				 /*
			
 
				  * List of devices that a metadevice uses and should open/close.
			
 
				  */
			
@@ -32,6 +39,7 @@ struct dm_dev_internal {
 
				 };
			
 
				 
			
 
				 struct dm_table;
			
 
				+struct dm_md_mempools;
			
 
				 
			
 
				 /*-----------------------------------------------------------------
			
 
				  * Internal table functions.
			
@@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t,
 
				 			     void (*fn)(void *), void *context);
			
 
				 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
			
 
				 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
			
 
				-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
			
 
				+int dm_calculate_queue_limits(struct dm_table *table,
			
 
				+			      struct queue_limits *limits);
			
 
				+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
			
 
				+			       struct queue_limits *limits);
			
 
				 struct list_head *dm_table_get_devices(struct dm_table *t);
			
 
				 void dm_table_presuspend_targets(struct dm_table *t);
			
 
				 void dm_table_postsuspend_targets(struct dm_table *t);
			
 
				 int dm_table_resume_targets(struct dm_table *t);
			
 
				 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
			
 
				+int dm_table_any_busy_target(struct dm_table *t);
			
 
				+int dm_table_set_type(struct dm_table *t);
			
 
				+unsigned dm_table_get_type(struct dm_table *t);
			
 
				+bool dm_table_bio_based(struct dm_table *t);
			
 
				+bool dm_table_request_based(struct dm_table *t);
			
 
				+int dm_table_alloc_md_mempools(struct dm_table *t);
			
 
				+void dm_table_free_md_mempools(struct dm_table *t);
			
 
				+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
			
 
				 
			
 
				 /*
			
 
				  * To check the return value from dm_table_find_target().
			
 
				  */
			
 
				 #define dm_target_is_valid(t) ((t)->table)
			
 
				 
			
 
				+/*
			
 
				+ * To check whether the target type is request-based or not (bio-based).
			
 
				+ */
			
 
				+#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
			
 
				+
			
 
				 /*-----------------------------------------------------------------
			
 
				  * A registry of target types.
			
 
				  *---------------------------------------------------------------*/
			
@@ -92,9 +116,16 @@ void dm_stripe_exit(void);
 
				 int dm_open_count(struct mapped_device *md);
			
 
				 int dm_lock_for_deletion(struct mapped_device *md);
			
 
				 
			
 
				-void dm_kobject_uevent(struct mapped_device *md);
			
 
				+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
			
 
				+		       unsigned cookie);
			
 
				 
			
 
				 int dm_kcopyd_init(void);
			
 
				 void dm_kcopyd_exit(void);
			
 
				 
			
 
				+/*
			
 
				+ * Mempool operations
			
 
				+ */
			
 
				+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
			
 
				+void dm_free_md_mempools(struct dm_md_mempools *pools);
			
 
				+
			
 
				 #endif
			
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -57,6 +57,7 @@ header-y += dlmconstants.h
 
				 header-y += dlm_device.h
			
 
				 header-y += dlm_netlink.h
			
 
				 header-y += dm-ioctl.h
			
 
				+header-y += dm-log-userspace.h
			
 
				 header-y += dn.h
			
 
				 header-y += dqblk_xfs.h
			
 
				 header-y += efs_fs_sb.h
			
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -41,8 +41,10 @@
 
				 #define CN_IDX_BB			0x5	/* BlackBoard, from the TSP GPL sampling framework */
			
 
				 #define CN_DST_IDX			0x6
			
 
				 #define CN_DST_VAL			0x1
			
 
				+#define CN_IDX_DM			0x7	/* Device Mapper */
			
 
				+#define CN_VAL_DM_USERSPACE_LOG		0x1
			
 
				 
			
 
				-#define CN_NETLINK_USERS		7
			
 
				+#define CN_NETLINK_USERS		8
			
 
				 
			
 
				 /*
			
 
				  * Maximum connector's message size.
			
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -11,6 +11,7 @@
 
				 #include <linux/bio.h>
			
 
				 #include <linux/blkdev.h>
			
 
				 
			
 
				+struct dm_dev;
			
 
				 struct dm_target;
			
 
				 struct dm_table;
			
 
				 struct mapped_device;
			
@@ -21,6 +22,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 
				 union map_info {
			
 
				 	void *ptr;
			
 
				 	unsigned long long ll;
			
 
				+	unsigned flush_request;
			
 
				 };
			
 
				 
			
 
				 /*
			
@@ -80,6 +82,15 @@ typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd,
 
				 typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
			
 
				 			    struct bio_vec *biovec, int max_size);
			
 
				 
			
 
				+typedef int (*iterate_devices_callout_fn) (struct dm_target *ti,
			
 
				+					   struct dm_dev *dev,
			
 
				+					   sector_t physical_start,
			
 
				+					   void *data);
			
 
				+
			
 
				+typedef int (*dm_iterate_devices_fn) (struct dm_target *ti,
			
 
				+				      iterate_devices_callout_fn fn,
			
 
				+				      void *data);
			
 
				+
			
 
				 /*
			
 
				  * Returns:
			
 
				  *    0: The target can handle the next I/O immediately.
			
@@ -92,7 +103,8 @@ void dm_error(const char *message);
 
				 /*
			
 
				  * Combine device limits.
			
 
				  */
			
 
				-void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev);
			
 
				+int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
			
 
				+			 sector_t start, void *data);
			
 
				 
			
 
				 struct dm_dev {
			
 
				 	struct block_device *bdev;
			
@@ -138,23 +150,12 @@ struct target_type {
 
				 	dm_ioctl_fn ioctl;
			
 
				 	dm_merge_fn merge;
			
 
				 	dm_busy_fn busy;
			
 
				+	dm_iterate_devices_fn iterate_devices;
			
 
				 
			
 
				 	/* For internal device-mapper use. */
			
 
				 	struct list_head list;
			
 
				 };
			
 
				 
			
 
				-struct io_restrictions {
			
 
				-	unsigned long bounce_pfn;
			
 
				-	unsigned long seg_boundary_mask;
			
 
				-	unsigned max_hw_sectors;
			
 
				-	unsigned max_sectors;
			
 
				-	unsigned max_segment_size;
			
 
				-	unsigned short logical_block_size;
			
 
				-	unsigned short max_hw_segments;
			
 
				-	unsigned short max_phys_segments;
			
 
				-	unsigned char no_cluster; /* inverted so that 0 is default */
			
 
				-};
			
 
				-
			
 
				 struct dm_target {
			
 
				 	struct dm_table *table;
			
 
				 	struct target_type *type;
			
@@ -163,15 +164,18 @@ struct dm_target {
 
				 	sector_t begin;
			
 
				 	sector_t len;
			
 
				 
			
 
				-	/* FIXME: turn this into a mask, and merge with io_restrictions */
			
 
				 	/* Always a power of 2 */
			
 
				 	sector_t split_io;
			
 
				 
			
 
				 	/*
			
 
				-	 * These are automatically filled in by
			
 
				-	 * dm_table_get_device.
			
 
				+	 * A number of zero-length barrier requests that will be submitted
			
 
				+	 * to the target for the purpose of flushing cache.
			
 
				+	 *
			
 
				+	 * The request number will be placed in union map_info->flush_request.
			
 
				+	 * It is the responsibility of the target driver to remap these requests
			
 
				+	 * to the real underlying devices.
			
 
				 	 */
			
 
				-	struct io_restrictions limits;
			
 
				+	unsigned num_flush_requests;
			
 
				 
			
 
				 	/* target specific data */
			
 
				 	void *private;
			
@@ -230,6 +234,7 @@ struct gendisk *dm_disk(struct mapped_device *md);
 
				 int dm_suspended(struct mapped_device *md);
			
 
				 int dm_noflush_suspending(struct dm_target *ti);
			
 
				 union map_info *dm_get_mapinfo(struct bio *bio);
			
 
				+union map_info *dm_get_rq_mapinfo(struct request *rq);
			
 
				 
			
 
				 /*
			
 
				  * Geometry functions.
			
@@ -392,4 +397,12 @@ static inline unsigned long to_bytes(sector_t n)
 
				 	return (n << SECTOR_SHIFT);
			
 
				 }
			
 
				 
			
 
				+/*-----------------------------------------------------------------
			
 
				+ * Helper for block layer and dm core operations
			
 
				+ *---------------------------------------------------------------*/
			
 
				+void dm_dispatch_request(struct request *rq);
			
 
				+void dm_requeue_unmapped_request(struct request *rq);
			
 
				+void dm_kill_unmapped_request(struct request *rq, int error);
			
 
				+int dm_underlying_device_busy(struct request_queue *q);
			
 
				+
			
 
				 #endif	/* _LINUX_DEVICE_MAPPER_H */
			
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -123,6 +123,16 @@ struct dm_ioctl {
 
				 	__u32 target_count;	/* in/out */
			
 
				 	__s32 open_count;	/* out */
			
 
				 	__u32 flags;		/* in/out */
			
 
				+
			
 
				+	/*
			
 
				+	 * event_nr holds either the event number (input and output) or the
			
 
				+	 * udev cookie value (input only).
			
 
				+	 * The DM_DEV_WAIT ioctl takes an event number as input.
			
 
				+	 * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls
			
 
				+	 * use the field as a cookie to return in the DM_COOKIE
			
 
				+	 * variable with the uevents they issue.
			
 
				+	 * For output, the ioctls return the event number, not the cookie.
			
 
				+	 */
			
 
				 	__u32 event_nr;      	/* in/out */
			
 
				 	__u32 padding;
			
 
				 
			
@@ -256,9 +266,9 @@ enum {
 
				 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
			
 
				 
			
 
				 #define DM_VERSION_MAJOR	4
			
 
				-#define DM_VERSION_MINOR	14
			
 
				+#define DM_VERSION_MINOR	15
			
 
				 #define DM_VERSION_PATCHLEVEL	0
			
 
				-#define DM_VERSION_EXTRA	"-ioctl (2008-04-23)"
			
 
				+#define DM_VERSION_EXTRA	"-ioctl (2009-04-01)"
			
 
				 
			
 
				 /* Status bits */
			
 
				 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
			
--- a/include/linux/dm-log-userspace.h
+++ b/include/linux/dm-log-userspace.h
@@ -0,0 +1,386 @@
 
				+/*
			
 
				+ * Copyright (C) 2006-2009 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the LGPL.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DM_LOG_USERSPACE_H__
			
 
				+#define __DM_LOG_USERSPACE_H__
			
 
				+
			
 
				+#include <linux/dm-ioctl.h> /* For DM_UUID_LEN */
			
 
				+
			
 
				+/*
			
 
				+ * The device-mapper userspace log module consists of a kernel component and
			
 
				+ * a user-space component.  The kernel component implements the API defined
			
 
				+ * in dm-dirty-log.h.  Its purpose is simply to pass the parameters and
			
 
				+ * return values of those API functions between kernel and user-space.
			
 
				+ *
			
 
				+ * Below are defined the 'request_types' - DM_ULOG_CTR, DM_ULOG_DTR, etc.
			
 
				+ * These request types represent the different functions in the device-mapper
			
 
				+ * dirty log API.  Each of these is described in more detail below.
			
 
				+ *
			
 
				+ * The user-space program must listen for requests from the kernel (representing
			
 
				+ * the various API functions) and process them.
			
 
				+ *
			
 
				+ * User-space begins by setting up the communication link (error checking
			
 
				+ * removed for clarity):
			
 
				+ *	fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
			
 
				+ *	addr.nl_family = AF_NETLINK;
			
 
				+ *	addr.nl_groups = CN_IDX_DM;
			
 
				+ *	addr.nl_pid = 0;
			
 
				+ *	r = bind(fd, (struct sockaddr *) &addr, sizeof(addr));
			
 
				+ *	opt = addr.nl_groups;
			
 
				+ *	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &opt, sizeof(opt));
			
 
				+ *
			
 
				+ * User-space will then wait to receive requests from the kernel, which it
			
 
				+ * will process as described below.  The requests are received in the form,
			
 
				+ * ((struct dm_ulog_request) + (additional data)).  Depending on the request
			
 
				+ * type, there may or may not be 'additional data'.  In the descriptions below,
			
 
				+ * you will see 'Payload-to-userspace' and 'Payload-to-kernel'.  The
			
 
				+ * 'Payload-to-userspace' is what the kernel sends in 'additional data' as
			
 
				+ * necessary parameters to complete the request.  The 'Payload-to-kernel' is
			
 
				+ * the 'additional data' returned to the kernel that contains the necessary
			
 
				+ * results of the request.  The 'data_size' field in the dm_ulog_request
			
 
				+ * structure denotes the availability and amount of payload data.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_CTR corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*ctr)(struct dm_dirty_log *log, struct dm_target *ti,
			
 
				+ *	      unsigned argc, char **argv);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	A single string containing all the argv arguments separated by ' 's
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
			
 
				+ *
			
 
				+ * The UUID contained in the dm_ulog_request structure is the reference that
			
 
				+ * will be used by all request types to a specific log.  The constructor must
			
 
				+ * record this association with the instance created.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_CTR                    1
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_DTR corresponds to (found in dm-dirty-log.h):
			
 
				+ * void (*dtr)(struct dm_dirty_log *log);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	A single string containing all the argv arguments separated by ' 's
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
			
 
				+ *
			
 
				+ * The UUID contained in the dm_ulog_request structure is all that is
			
 
				+ * necessary to identify the log instance being destroyed.  There is no
			
 
				+ * payload data.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_DTR                    2
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_PRESUSPEND corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*presuspend)(struct dm_dirty_log *log);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	None.
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.
			
 
				+ *
			
 
				+ * The UUID contained in the dm_ulog_request structure is all that is
			
 
				+ * necessary to identify the log instance being presuspended.  There is no
			
 
				+ * payload data.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_PRESUSPEND             3
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_POSTSUSPEND corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*postsuspend)(struct dm_dirty_log *log);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	None.
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.
			
 
				+ *
			
 
				+ * The UUID contained in the dm_ulog_request structure is all that is
			
 
				+ * necessary to identify the log instance being postsuspended.  There is no
			
 
				+ * payload data.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_POSTSUSPEND            4
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_RESUME corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*resume)(struct dm_dirty_log *log);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	None.
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.
			
 
				+ *
			
 
				+ * The UUID contained in the dm_ulog_request structure is all that is
			
 
				+ * necessary to identify the log instance being resumed.  There is no
			
 
				+ * payload data.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_RESUME                 5
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_GET_REGION_SIZE corresponds to (found in dm-dirty-log.h):
			
 
				+ * uint32_t (*get_region_size)(struct dm_dirty_log *log);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	None.
			
 
				+ * Payload-to-kernel:
			
 
				+ *	uint64_t - contains the region size
			
 
				+ *
			
 
				+ * The region size is something that was determined at constructor time.
			
 
				+ * It is returned in the payload area and 'data_size' is set to
			
 
				+ * reflect this.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_GET_REGION_SIZE        6
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_IS_CLEAN corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*is_clean)(struct dm_dirty_log *log, region_t region);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	uint64_t - the region to get clean status on
			
 
				+ * Payload-to-kernel:
			
 
				+ *	int64_t  - 1 if clean, 0 otherwise
			
 
				+ *
			
 
				+ * Payload is sizeof(uint64_t) and contains the region for which the clean
			
 
				+ * status is being requested.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - filling the payload with 0 (not clean) or
			
 
				+ * 1 (clean), setting 'data_size' and 'error' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_IS_CLEAN               7
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_IN_SYNC corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*in_sync)(struct dm_dirty_log *log, region_t region,
			
 
				+ *		  int can_block);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	uint64_t - the region to get sync status on
			
 
				+ * Payload-to-kernel:
			
 
				+ *	int64_t - 1 if in-sync, 0 otherwise
			
 
				+ *
			
 
				+ * Exactly the same as 'is_clean' above, except this time asking "has the
			
 
				+ * region been recovered?" vs. "is the region not being modified?"
			
 
				+ */
			
 
				+#define DM_ULOG_IN_SYNC                8
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_FLUSH corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*flush)(struct dm_dirty_log *log);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	None.
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.
			
 
				+ *
			
 
				+ * No incoming or outgoing payload.  Simply flush log state to disk.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_FLUSH                  9
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_MARK_REGION corresponds to (found in dm-dirty-log.h):
			
 
				+ * void (*mark_region)(struct dm_dirty_log *log, region_t region);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	uint64_t [] - region(s) to mark
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.
			
 
				+ *
			
 
				+ * Incoming payload contains the one or more regions to mark dirty.
			
 
				+ * The number of regions contained in the payload can be determined from
			
 
				+ * 'data_size/sizeof(uint64_t)'.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_MARK_REGION           10
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_CLEAR_REGION corresponds to (found in dm-dirty-log.h):
			
 
				+ * void (*clear_region)(struct dm_dirty_log *log, region_t region);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	uint64_t [] - region(s) to clear
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.
			
 
				+ *
			
 
				+ * Incoming payload contains the one or more regions to mark clean.
			
 
				+ * The number of regions contained in the payload can be determined from
			
 
				+ * 'data_size/sizeof(uint64_t)'.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_CLEAR_REGION          11
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_GET_RESYNC_WORK corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*get_resync_work)(struct dm_dirty_log *log, region_t *region);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	None.
			
 
				+ * Payload-to-kernel:
			
 
				+ *	{
			
 
				+ *		int64_t i; -- 1 if recovery necessary, 0 otherwise
			
 
				+ *		uint64_t r; -- The region to recover if i=1
			
 
				+ *	}
			
 
				+ * 'data_size' should be set appropriately.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_GET_RESYNC_WORK       12
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_SET_REGION_SYNC corresponds to (found in dm-dirty-log.h):
			
 
				+ * void (*set_region_sync)(struct dm_dirty_log *log,
			
 
				+ *			   region_t region, int in_sync);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	{
			
 
				+ *		uint64_t - region to set sync state on
			
 
				+ *		int64_t  - 0 if not-in-sync, 1 if in-sync
			
 
				+ *	}
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_SET_REGION_SYNC       13
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_GET_SYNC_COUNT corresponds to (found in dm-dirty-log.h):
			
 
				+ * region_t (*get_sync_count)(struct dm_dirty_log *log);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	None.
			
 
				+ * Payload-to-kernel:
			
 
				+ *	uint64_t - the number of in-sync regions
			
 
				+ *
			
 
				+ * No incoming payload.  Kernel-bound payload contains the number of
			
 
				+ * regions that are in-sync (in a uint64_t).
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_GET_SYNC_COUNT        14
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_STATUS_INFO corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_INFO,
			
 
				+ *		 char *result, unsigned maxlen);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	None.
			
 
				+ * Payload-to-kernel:
			
 
				+ *	Character string containing STATUSTYPE_INFO
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_STATUS_INFO           15
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_STATUS_TABLE corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_TABLE,
			
 
				+ *		 char *result, unsigned maxlen);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	None.
			
 
				+ * Payload-to-kernel:
			
 
				+ *	Character string containing STATUSTYPE_TABLE
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_STATUS_TABLE          16
			
 
				+
			
 
				+/*
			
 
				+ * DM_ULOG_IS_REMOTE_RECOVERING corresponds to (found in dm-dirty-log.h):
			
 
				+ * int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region);
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	uint64_t - region to determine recovery status on
			
 
				+ * Payload-to-kernel:
			
 
				+ *	{
			
 
				+ *		int64_t is_recovering;  -- 0 if no, 1 if yes
			
 
				+ *		uint64_t in_sync_hint;  -- lowest region still needing resync
			
 
				+ *	}
			
 
				+ *
			
 
				+ * When the request has been processed, user-space must return the
			
 
				+ * dm_ulog_request to the kernel - setting the 'error' field and
			
 
				+ * 'data_size' appropriately.
			
 
				+ */
			
 
				+#define DM_ULOG_IS_REMOTE_RECOVERING  17
			
 
				+
			
 
				+/*
			
 
				+ * (DM_ULOG_REQUEST_MASK & request_type) to get the request type
			
 
				+ *
			
 
				+ * Payload-to-userspace:
			
 
				+ *	A single string containing all the argv arguments separated by ' 's
			
 
				+ * Payload-to-kernel:
			
 
				+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
			
 
				+ *
			
 
				+ * We are reserving 8 bits of the 32-bit 'request_type' field for the
			
 
				+ * various request types above.  The remaining 24-bits are currently
			
 
				+ * set to zero and are reserved for future use and compatibility concerns.
			
 
				+ *
			
 
				+ * User-space should always use DM_ULOG_REQUEST_TYPE to acquire the
			
 
				+ * request type from the 'request_type' field to maintain forward compatibility.
			
 
				+ */
			
 
				+#define DM_ULOG_REQUEST_MASK 0xFF
			
 
				+#define DM_ULOG_REQUEST_TYPE(request_type) \
			
 
				+	(DM_ULOG_REQUEST_MASK & (request_type))
			
 
				+
			
 
				+struct dm_ulog_request {
			
 
				+	char uuid[DM_UUID_LEN]; /* Ties a request to a specific mirror log */
			
 
				+	char padding[7];        /* Padding because DM_UUID_LEN = 129 */
			
 
				+
			
 
				+	int32_t error;          /* Used to report back processing errors */
			
 
				+
			
 
				+	uint32_t seq;           /* Sequence number for request */
			
 
				+	uint32_t request_type;  /* DM_ULOG_* defined above */
			
 
				+	uint32_t data_size;     /* How much data (not including this struct) */
			
 
				+
			
 
				+	char data[0];
			
 
				+};
			
 
				+
			
 
				+#endif /* __DM_LOG_USERSPACE_H__ */