Merge tag 'dm-3.6-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm

Pull device-mapper updates from Alasdair G Kergon:
- Flip the thin target into new read-only or failed modes if errors
  are detected;
- Handle chunk sizes that are not powers of two in the snapshot and
  thin targets;
- Provide a way for userspace to avoid replacing an already-loaded
  multipath hardware handler while booting (see the sketch after this
  list);
- Reduce dm_thin_endio_hook slab size to avoid allocation failures;
- Numerous small changes and cleanups to the code.
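
To make the multipath change concrete: the new behaviour is requested as a
feature argument on the table line. A minimal sketch, assuming a single path
group with the round-robin selector; the device number, sizes and selector
arguments below are invented for illustration, not taken from this merge:

    # Keep whatever hardware handler the SCSI layer already attached,
    # instead of replacing it at table-load time (values are made up).
    dmsetup create mpatha --table \
      "0 2097152 multipath 1 retain_attached_hw_handler 0 1 1 round-robin 0 1 1 8:16 1000"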

* tag 'dm-3.6-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm: (63 commits)
  dm thin: commit before gathering status
  dm thin: add read only and fail io modes
  dm thin metadata: introduce dm_pool_abort_metadata
  dm thin metadata: introduce dm_pool_metadata_set_read_only
  dm persistent data: introduce dm_bm_set_read_only
  dm thin: reduce number of metadata commits
  dm thin metadata: add dm_thin_changed_this_transaction
  dm thin metadata: add format option to dm_pool_metadata_open
  dm thin metadata: tidy up open and format error paths
  dm thin metadata: only check incompat features on open
  dm thin metadata: remove duplicate pmd initialisation
  dm thin metadata: remove create parameter from __create_persistent_data_objects
  dm thin metadata: move __superblock_all_zeroes to __open_or_format_metadata
  dm thin metadata: remove nr_blocks arg from __create_persistent_data_objects
  dm thin metadata: split __open or format metadata
  dm thin metadata: use struct dm_pool_metadata members in __open_or_format_metadata
  dm thin metadata: zero unused superblock uuid
  dm thin metadata: lift __begin_transaction out of __write_initial_superblock
  dm thin metadata: move dm_commit_pool_metadata into __write_initial_superblock
  dm thin metadata: factor out __write_initial_superblock
  ...
Linus Torvalds, 13 years ago
commit 7272c30b6f

+ 3 - 4
Documentation/device-mapper/striped.txt

@@ -9,15 +9,14 @@ devices in parallel.
 
 Parameters: <num devs> <chunk size> [<dev path> <offset>]+
     <num devs>: Number of underlying devices.
-    <chunk size>: Size of each chunk of data. Must be a power-of-2 and at
-                  least as large as the system's PAGE_SIZE.
+    <chunk size>: Size of each chunk of data. Must be at least as
+                  large as the system's PAGE_SIZE.
     <dev path>: Full pathname to the underlying block-device, or a
                 "major:minor" device-number.
     <offset>: Starting sector within the device.
 
 One or more underlying devices can be specified. The striped device size must
-be a multiple of the chunk size and a multiple of the number of underlying
-devices.
+be a multiple of the chunk size multiplied by the number of underlying devices.
 
 
 Example scripts
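
With the power-of-2 restriction gone, the only remaining constraints are the
ones stated above. A minimal sketch of a table a kernel with this change
should accept, using a 192-sector chunk (not a power of two); the device
paths and length are invented for illustration:

    # 2 stripes, 192-sector chunks; 384000 = 192 * 2 * 1000, so the target
    # length is a multiple of chunk size times the number of devices.
    dmsetup create stripe_test --table \
      "0 384000 striped 2 192 /dev/sdb 0 /dev/sdc 0"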

+ 23 - 1
Documentation/device-mapper/thin-provisioning.txt

@@ -231,6 +231,9 @@ i) Constructor
       no_discard_passdown: Don't pass discards down to the underlying
 			   data device, but just remove the mapping.
 
+      read_only: Don't allow any changes to be made to the pool
+		 metadata.
+
     Data block size must be between 64KB (128 sectors) and 1GB
     (2097152 sectors) inclusive.
 
@@ -239,7 +242,7 @@ ii) Status
 
     <transaction id> <used metadata blocks>/<total metadata blocks>
     <used data blocks>/<total data blocks> <held metadata root>
-
+    [no_]discard_passdown ro|rw
 
     transaction id:
 	A 64-bit number used by userspace to help synchronise with metadata
@@ -257,6 +260,21 @@ ii) Status
 	held root.  This feature is not yet implemented so '-' is
 	always returned.
 
+    discard_passdown|no_discard_passdown
+	Whether or not discards are actually being passed down to the
+	underlying device.  Even if this is enabled when the table is
+	loaded, it can get disabled if the underlying device doesn't
+	support it.
+
+    ro|rw
+	If the pool encounters certain types of device failures it will
+	drop into a read-only metadata mode in which no changes to
+	the pool metadata (like allocating new blocks) are permitted.
+
+	In serious cases where even a read-only mode is deemed unsafe
+	no further I/O will be permitted and the status will just
+	contain the string 'Fail'.  The userspace recovery tools
+	should then be used.
+
 iii) Messages
 
     create_thin <dev id>
@@ -329,3 +347,7 @@ regain some space then send the 'trim' message to the pool.
 ii) Status
 
     <nr mapped sectors> <highest mapped sector>
+
+	If the pool has encountered device errors and failed, the status
+	will just contain the string 'Fail'.  The userspace recovery
+	tools should then be used.
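
Putting the constructor and status changes together, a hypothetical pool
using the new read_only feature argument might be loaded and inspected
roughly as follows; the device paths, sizes and block counts are invented,
and the status line merely illustrates the documented field order:

    # One feature argument follows the low-water-mark field.
    dmsetup create pool --table \
      "0 20971520 thin-pool /dev/vg/meta /dev/vg/data 128 32768 1 read_only"
    dmsetup status pool
    # e.g.: 0 20971520 thin-pool 0 19/2048 0/163840 - discard_passdown ro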

+ 0 - 9
drivers/md/Kconfig

@@ -260,15 +260,6 @@ config DM_DEBUG_BLOCK_STACK_TRACING
 
 	  If unsure, say N.
 
-config DM_DEBUG_SPACE_MAPS
-	boolean "Extra validation for thin provisioning space maps"
-	depends on DM_THIN_PROVISIONING
-	---help---
-	  Enable this for messages that may help debug problems with the
-	  space maps used by thin provisioning.
-
-          If unsure, say N.
-
 config DM_MIRROR
        tristate "Mirror target"
        depends on BLK_DEV_DM

+ 105 - 114
drivers/md/dm-crypt.c

@@ -42,21 +42,21 @@ struct convert_context {
 	unsigned int offset_out;
 	unsigned int idx_in;
 	unsigned int idx_out;
-	sector_t sector;
-	atomic_t pending;
+	sector_t cc_sector;
+	atomic_t cc_pending;
 };
 
 /*
  * per bio private data
  */
 struct dm_crypt_io {
-	struct dm_target *target;
+	struct crypt_config *cc;
 	struct bio *base_bio;
 	struct work_struct work;
 
 	struct convert_context ctx;
 
-	atomic_t pending;
+	atomic_t io_pending;
 	int error;
 	sector_t sector;
 	struct dm_crypt_io *base_io;
@@ -109,9 +109,6 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
  */
 struct crypt_cpu {
 	struct ablkcipher_request *req;
-	/* ESSIV: struct crypto_cipher *essiv_tfm */
-	void *iv_private;
-	struct crypto_ablkcipher *tfms[0];
 };
 
 /*
@@ -151,6 +148,10 @@ struct crypt_config {
 	 * per_cpu_ptr() only.
 	 */
 	struct crypt_cpu __percpu *cpu;
+
+	/* ESSIV: struct crypto_cipher *essiv_tfm */
+	void *iv_private;
+	struct crypto_ablkcipher **tfms;
 	unsigned tfms_count;
 
 	/*
@@ -193,7 +194,7 @@ static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
  */
 static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
 {
-	return __this_cpu_ptr(cc->cpu)->tfms[0];
+	return cc->tfms[0];
 }
 
 /*
@@ -258,7 +259,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	struct hash_desc desc;
 	struct scatterlist sg;
 	struct crypto_cipher *essiv_tfm;
-	int err, cpu;
+	int err;
 
 	sg_init_one(&sg, cc->key, cc->key_size);
 	desc.tfm = essiv->hash_tfm;
@@ -268,14 +269,12 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	if (err)
 		return err;
 
-	for_each_possible_cpu(cpu) {
-		essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
+	essiv_tfm = cc->iv_private;
 
-		err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
-				    crypto_hash_digestsize(essiv->hash_tfm));
-		if (err)
-			return err;
-	}
+	err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
+			    crypto_hash_digestsize(essiv->hash_tfm));
+	if (err)
+		return err;
 
 	return 0;
 }
@@ -286,16 +285,14 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
 	struct crypto_cipher *essiv_tfm;
-	int cpu, r, err = 0;
+	int r, err = 0;
 
 	memset(essiv->salt, 0, salt_size);
 
-	for_each_possible_cpu(cpu) {
-		essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
-		r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
-		if (r)
-			err = r;
-	}
+	essiv_tfm = cc->iv_private;
+	r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
+	if (r)
+		err = r;
 
 	return err;
 }
@@ -335,8 +332,6 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
 
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
-	int cpu;
-	struct crypt_cpu *cpu_cc;
 	struct crypto_cipher *essiv_tfm;
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
@@ -346,15 +341,12 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 	kzfree(essiv->salt);
 	essiv->salt = NULL;
 
-	for_each_possible_cpu(cpu) {
-		cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-		essiv_tfm = cpu_cc->iv_private;
+	essiv_tfm = cc->iv_private;
 
-		if (essiv_tfm)
-			crypto_free_cipher(essiv_tfm);
+	if (essiv_tfm)
+		crypto_free_cipher(essiv_tfm);
 
-		cpu_cc->iv_private = NULL;
-	}
+	cc->iv_private = NULL;
 }
 
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -363,7 +355,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	struct crypto_cipher *essiv_tfm = NULL;
 	struct crypto_hash *hash_tfm = NULL;
 	u8 *salt = NULL;
-	int err, cpu;
+	int err;
 
 	if (!opts) {
 		ti->error = "Digest algorithm missing for ESSIV mode";
@@ -388,15 +380,13 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	cc->iv_gen_private.essiv.salt = salt;
 	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
 
-	for_each_possible_cpu(cpu) {
-		essiv_tfm = setup_essiv_cpu(cc, ti, salt,
-					crypto_hash_digestsize(hash_tfm));
-		if (IS_ERR(essiv_tfm)) {
-			crypt_iv_essiv_dtr(cc);
-			return PTR_ERR(essiv_tfm);
-		}
-		per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
+	essiv_tfm = setup_essiv_cpu(cc, ti, salt,
+				crypto_hash_digestsize(hash_tfm));
+	if (IS_ERR(essiv_tfm)) {
+		crypt_iv_essiv_dtr(cc);
+		return PTR_ERR(essiv_tfm);
 	}
+	cc->iv_private = essiv_tfm;
 
 	return 0;
 
@@ -410,7 +400,7 @@ bad:
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
 			      struct dm_crypt_request *dmreq)
 {
-	struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
+	struct crypto_cipher *essiv_tfm = cc->iv_private;
 
 	memset(iv, 0, cc->iv_size);
 	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
@@ -664,7 +654,7 @@ static void crypt_convert_init(struct crypt_config *cc,
 	ctx->offset_out = 0;
 	ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
 	ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
-	ctx->sector = sector + cc->iv_offset;
+	ctx->cc_sector = sector + cc->iv_offset;
 	init_completion(&ctx->restart);
 }
 
@@ -695,12 +685,12 @@ static int crypt_convert_block(struct crypt_config *cc,
 	struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
 	struct dm_crypt_request *dmreq;
 	u8 *iv;
-	int r = 0;
+	int r;
 
 	dmreq = dmreq_of_req(cc, req);
 	iv = iv_of_dmreq(cc, dmreq);
 
-	dmreq->iv_sector = ctx->sector;
+	dmreq->iv_sector = ctx->cc_sector;
 	dmreq->ctx = ctx;
 	sg_init_table(&dmreq->sg_in, 1);
 	sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -749,12 +739,12 @@ static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
 	struct crypt_cpu *this_cc = this_crypt_config(cc);
-	unsigned key_index = ctx->sector & (cc->tfms_count - 1);
+	unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
 
 	if (!this_cc->req)
 		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
-	ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
+	ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
 	ablkcipher_request_set_callback(this_cc->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
@@ -769,14 +759,14 @@ static int crypt_convert(struct crypt_config *cc,
 	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	int r;
 
-	atomic_set(&ctx->pending, 1);
+	atomic_set(&ctx->cc_pending, 1);
 
 	while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
 	      ctx->idx_out < ctx->bio_out->bi_vcnt) {
 
 		crypt_alloc_req(cc, ctx);
 
-		atomic_inc(&ctx->pending);
+		atomic_inc(&ctx->cc_pending);
 
 		r = crypt_convert_block(cc, ctx, this_cc->req);
 
@@ -788,19 +778,19 @@ static int crypt_convert(struct crypt_config *cc,
 			/* fall through*/
 		case -EINPROGRESS:
 			this_cc->req = NULL;
-			ctx->sector++;
+			ctx->cc_sector++;
 			continue;
 
 		/* sync */
 		case 0:
-			atomic_dec(&ctx->pending);
-			ctx->sector++;
+			atomic_dec(&ctx->cc_pending);
+			ctx->cc_sector++;
 			cond_resched();
 			continue;
 
 		/* error */
 		default:
-			atomic_dec(&ctx->pending);
+			atomic_dec(&ctx->cc_pending);
 			return r;
 		}
 	}
@@ -811,7 +801,7 @@ static int crypt_convert(struct crypt_config *cc,
 static void dm_crypt_bio_destructor(struct bio *bio)
 {
 	struct dm_crypt_io *io = bio->bi_private;
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	bio_free(bio, cc->bs);
 }
@@ -825,7 +815,7 @@ static void dm_crypt_bio_destructor(struct bio *bio)
 static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
 				      unsigned *out_of_pages)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
@@ -884,26 +874,25 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
 	}
 }
 
-static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti,
+static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
 					  struct bio *bio, sector_t sector)
 {
-	struct crypt_config *cc = ti->private;
 	struct dm_crypt_io *io;
 
 	io = mempool_alloc(cc->io_pool, GFP_NOIO);
-	io->target = ti;
+	io->cc = cc;
 	io->base_bio = bio;
 	io->sector = sector;
 	io->error = 0;
 	io->base_io = NULL;
-	atomic_set(&io->pending, 0);
+	atomic_set(&io->io_pending, 0);
 
 	return io;
 }
 
 static void crypt_inc_pending(struct dm_crypt_io *io)
 {
-	atomic_inc(&io->pending);
+	atomic_inc(&io->io_pending);
 }
 
 /*
@@ -913,12 +902,12 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
  */
 static void crypt_dec_pending(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
 	struct dm_crypt_io *base_io = io->base_io;
 	int error = io->error;
 
-	if (!atomic_dec_and_test(&io->pending))
+	if (!atomic_dec_and_test(&io->io_pending))
 		return;
 
 	mempool_free(io, cc->io_pool);
@@ -952,7 +941,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 static void crypt_endio(struct bio *clone, int error)
 {
 	struct dm_crypt_io *io = clone->bi_private;
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	unsigned rw = bio_data_dir(clone);
 
 	if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
@@ -979,7 +968,7 @@ static void crypt_endio(struct bio *clone, int error)
 
 static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	clone->bi_private = io;
 	clone->bi_end_io  = crypt_endio;
@@ -990,7 +979,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
 
@@ -1038,7 +1027,7 @@ static void kcryptd_io(struct work_struct *work)
 
 static void kcryptd_queue_io(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	INIT_WORK(&io->work, kcryptd_io);
 	queue_work(cc->io_queue, &io->work);
@@ -1047,7 +1036,7 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
 static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 {
 	struct bio *clone = io->ctx.bio_out;
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1069,7 +1058,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 
 static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	struct bio *clone;
 	struct dm_crypt_io *new_io;
 	int crypt_finished;
@@ -1107,7 +1096,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		if (r < 0)
 			io->error = -EIO;
 
-		crypt_finished = atomic_dec_and_test(&io->ctx.pending);
+		crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
 
 		/* Encryption was already finished, submit io now */
 		if (crypt_finished) {
@@ -1135,7 +1124,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		 * between fragments, so switch to a new dm_crypt_io structure.
 		 */
 		if (unlikely(!crypt_finished && remaining)) {
-			new_io = crypt_io_alloc(io->target, io->base_bio,
+			new_io = crypt_io_alloc(io->cc, io->base_bio,
 						sector);
 			crypt_inc_pending(new_io);
 			crypt_convert_init(cc, &new_io->ctx, NULL,
@@ -1169,7 +1158,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
 
 static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	int r = 0;
 
 	crypt_inc_pending(io);
@@ -1181,7 +1170,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 	if (r < 0)
 		io->error = -EIO;
 
-	if (atomic_dec_and_test(&io->ctx.pending))
+	if (atomic_dec_and_test(&io->ctx.cc_pending))
 		kcryptd_crypt_read_done(io);
 
 	crypt_dec_pending(io);
@@ -1193,7 +1182,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	struct dm_crypt_request *dmreq = async_req->data;
 	struct convert_context *ctx = dmreq->ctx;
 	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	if (error == -EINPROGRESS) {
 		complete(&ctx->restart);
@@ -1208,7 +1197,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 
 	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
 
-	if (!atomic_dec_and_test(&ctx->pending))
+	if (!atomic_dec_and_test(&ctx->cc_pending))
 		return;
 
 	if (bio_data_dir(io->base_bio) == READ)
@@ -1229,7 +1218,7 @@ static void kcryptd_crypt(struct work_struct *work)
 
 static void kcryptd_queue_crypt(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	INIT_WORK(&io->work, kcryptd_crypt);
 	queue_work(cc->crypt_queue, &io->work);
@@ -1241,7 +1230,6 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
 static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
 {
 	char buffer[3];
-	char *endp;
 	unsigned int i;
 
 	buffer[2] = '\0';
@@ -1250,9 +1238,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
 		buffer[0] = *hex++;
 		buffer[1] = *hex++;
 
-		key[i] = (u8)simple_strtoul(buffer, &endp, 16);
-
-		if (endp != &buffer[2])
+		if (kstrtou8(buffer, 16, &key[i]))
 			return -EINVAL;
 	}
 
@@ -1276,29 +1262,38 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
 	}
 }
 
-static void crypt_free_tfms(struct crypt_config *cc, int cpu)
+static void crypt_free_tfms(struct crypt_config *cc)
 {
-	struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
 	unsigned i;
 
+	if (!cc->tfms)
+		return;
+
 	for (i = 0; i < cc->tfms_count; i++)
-		if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
-			crypto_free_ablkcipher(cpu_cc->tfms[i]);
-			cpu_cc->tfms[i] = NULL;
+		if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
+			crypto_free_ablkcipher(cc->tfms[i]);
+			cc->tfms[i] = NULL;
 		}
+
+	kfree(cc->tfms);
+	cc->tfms = NULL;
 }
 
-static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
+static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
 {
-	struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
 	unsigned i;
 	int err;
 
+	cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *),
+			   GFP_KERNEL);
+	if (!cc->tfms)
+		return -ENOMEM;
+
 	for (i = 0; i < cc->tfms_count; i++) {
-		cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
-		if (IS_ERR(cpu_cc->tfms[i])) {
-			err = PTR_ERR(cpu_cc->tfms[i]);
-			crypt_free_tfms(cc, cpu);
+		cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
+		if (IS_ERR(cc->tfms[i])) {
+			err = PTR_ERR(cc->tfms[i]);
+			crypt_free_tfms(cc);
 			return err;
 		}
 	}
@@ -1309,15 +1304,14 @@ static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
 static int crypt_setkey_allcpus(struct crypt_config *cc)
 {
 	unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
-	int cpu, err = 0, i, r;
-
-	for_each_possible_cpu(cpu) {
-		for (i = 0; i < cc->tfms_count; i++) {
-			r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
-						     cc->key + (i * subkey_size), subkey_size);
-			if (r)
-				err = r;
-		}
+	int err = 0, i, r;
+
+	for (i = 0; i < cc->tfms_count; i++) {
+		r = crypto_ablkcipher_setkey(cc->tfms[i],
+					     cc->key + (i * subkey_size),
+					     subkey_size);
+		if (r)
+			err = r;
 	}
 
 	return err;
@@ -1379,9 +1373,10 @@ static void crypt_dtr(struct dm_target *ti)
 			cpu_cc = per_cpu_ptr(cc->cpu, cpu);
 			if (cpu_cc->req)
 				mempool_free(cpu_cc->req, cc->req_pool);
-			crypt_free_tfms(cc, cpu);
 		}
 
+	crypt_free_tfms(cc);
+
 	if (cc->bs)
 		bioset_free(cc->bs);
 
@@ -1414,7 +1409,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	struct crypt_config *cc = ti->private;
 	char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
 	char *cipher_api = NULL;
-	int cpu, ret = -EINVAL;
+	int ret = -EINVAL;
 	char dummy;
 
 	/* Convert to crypto api definition? */
@@ -1455,8 +1450,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
-	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
-				 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
+	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
 				 __alignof__(struct crypt_cpu));
 	if (!cc->cpu) {
 		ti->error = "Cannot allocate per cpu state";
@@ -1489,12 +1483,10 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	}
 
 	/* Allocate cipher */
-	for_each_possible_cpu(cpu) {
-		ret = crypt_alloc_tfms(cc, cpu, cipher_api);
-		if (ret < 0) {
-			ti->error = "Error allocating crypto tfm";
-			goto bad;
-		}
+	ret = crypt_alloc_tfms(cc, cipher_api);
+	if (ret < 0) {
+		ti->error = "Error allocating crypto tfm";
+		goto bad;
 	}
 
 	/* Initialize and set key */
@@ -1702,7 +1694,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->num_flush_requests = 1;
-	ti->discard_zeroes_data_unsupported = 1;
+	ti->discard_zeroes_data_unsupported = true;
 
 	return 0;
 
@@ -1715,7 +1707,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 		     union map_info *map_context)
 {
 	struct dm_crypt_io *io;
-	struct crypt_config *cc;
+	struct crypt_config *cc = ti->private;
 
 	/*
 	 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
@@ -1723,14 +1715,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	 * - for REQ_DISCARD caller must use flush if IO ordering matters
 	 */
 	if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
-		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
 		if (bio_sectors(bio))
 			bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
 		return DM_MAPIO_REMAPPED;
 	}
 
-	io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
+	io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector));
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
@@ -1742,7 +1733,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 }
 
 static int crypt_status(struct dm_target *ti, status_type_t type,
-			char *result, unsigned int maxlen)
+			unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct crypt_config *cc = ti->private;
 	unsigned int sz = 0;

+ 1 - 1
drivers/md/dm-delay.c

@@ -295,7 +295,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
 }
 
 static int delay_status(struct dm_target *ti, status_type_t type,
-			char *result, unsigned maxlen)
+			unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct delay_c *dc = ti->private;
 	int sz = 0;

+ 4 - 9
drivers/md/dm-exception-store.c

@@ -142,24 +142,19 @@ EXPORT_SYMBOL(dm_exception_store_type_unregister);
 static int set_chunk_size(struct dm_exception_store *store,
 			  const char *chunk_size_arg, char **error)
 {
-	unsigned long chunk_size_ulong;
-	char *value;
+	unsigned chunk_size;
 
-	chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
-	if (*chunk_size_arg == '\0' || *value != '\0' ||
-	    chunk_size_ulong > UINT_MAX) {
+	if (kstrtouint(chunk_size_arg, 10, &chunk_size)) {
 		*error = "Invalid chunk size";
 		return -EINVAL;
 	}
 
-	if (!chunk_size_ulong) {
+	if (!chunk_size) {
 		store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
 		return 0;
 	}
 
-	return dm_exception_store_set_chunk_size(store,
-						 (unsigned) chunk_size_ulong,
-						 error);
+	return dm_exception_store_set_chunk_size(store, chunk_size, error);
 }
 
 int dm_exception_store_set_chunk_size(struct dm_exception_store *store,

+ 1 - 1
drivers/md/dm-flakey.c

@@ -333,7 +333,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
 }
 
 static int flakey_status(struct dm_target *ti, status_type_t type,
-			 char *result, unsigned int maxlen)
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned sz = 0;
 	struct flakey_c *fc = ti->private;

+ 4 - 1
drivers/md/dm-ioctl.c

@@ -1054,6 +1054,7 @@ static void retrieve_status(struct dm_table *table,
 	char *outbuf, *outptr;
 	status_type_t type;
 	size_t remaining, len, used = 0;
+	unsigned status_flags = 0;
 
 	outptr = outbuf = get_result_buffer(param, param_size, &len);
 
@@ -1090,7 +1091,9 @@ static void retrieve_status(struct dm_table *table,
 
 		/* Get the status/table string from the target driver */
 		if (ti->type->status) {
-			if (ti->type->status(ti, type, outptr, remaining)) {
+			if (param->flags & DM_NOFLUSH_FLAG)
+				status_flags |= DM_STATUS_NOFLUSH_FLAG;
+			if (ti->type->status(ti, type, status_flags, outptr, remaining)) {
 				param->flags |= DM_BUFFER_FULL_FLAG;
 				break;
 			}

+ 1 - 1
drivers/md/dm-linear.c

@@ -96,7 +96,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio,
 }
 
 static int linear_status(struct dm_target *ti, status_type_t type,
-			 char *result, unsigned int maxlen)
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct linear_c *lc = (struct linear_c *) ti->private;
 

+ 42 - 7
drivers/md/dm-mpath.c

@@ -85,6 +85,7 @@ struct multipath {
 	unsigned queue_io:1;		/* Must we queue all I/O? */
 	unsigned queue_if_no_path:1;	/* Queue I/O if last path fails? */
 	unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
+	unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
 
 	unsigned pg_init_retries;	/* Number of times to retry pg_init */
 	unsigned pg_init_count;		/* Number of times pg_init called */
@@ -568,6 +569,8 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 	int r;
 	struct pgpath *p;
 	struct multipath *m = ti->private;
+	struct request_queue *q = NULL;
+	const char *attached_handler_name;
 
 	/* we need at least a path arg */
 	if (as->argc < 1) {
@@ -586,13 +589,37 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 		goto bad;
 	}
 
-	if (m->hw_handler_name) {
-		struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
+	if (m->retain_attached_hw_handler || m->hw_handler_name)
+		q = bdev_get_queue(p->path.dev->bdev);
+
+	if (m->retain_attached_hw_handler) {
+		attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
+		if (attached_handler_name) {
+			/*
+			 * Reset hw_handler_name to match the attached handler
+			 * and clear any hw_handler_params associated with the
+			 * ignored handler.
+			 *
+			 * NB. This modifies the table line to show the actual
+			 * handler instead of the original table passed in.
+			 */
+			kfree(m->hw_handler_name);
+			m->hw_handler_name = attached_handler_name;
+
+			kfree(m->hw_handler_params);
+			m->hw_handler_params = NULL;
+		}
+	}
 
+	if (m->hw_handler_name) {
+		/*
+		 * Increments scsi_dh reference, even when using an
+		 * already-attached handler.
+		 */
 		r = scsi_dh_attach(q, m->hw_handler_name);
 		if (r == -EBUSY) {
 			/*
-			 * Already attached to different hw_handler,
+			 * Already attached to different hw_handler:
 			 * try to reattach with correct one.
 			 */
 			scsi_dh_detach(q);
@@ -760,7 +787,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 	const char *arg_name;
 
 	static struct dm_arg _args[] = {
-		{0, 5, "invalid number of feature args"},
+		{0, 6, "invalid number of feature args"},
 		{1, 50, "pg_init_retries must be between 1 and 50"},
 		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
 	};
@@ -781,6 +808,11 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 			continue;
 		}
 
+		if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
+			m->retain_attached_hw_handler = 1;
+			continue;
+		}
+
 		if (!strcasecmp(arg_name, "pg_init_retries") &&
 		    (argc >= 1)) {
 			r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
@@ -1346,7 +1378,7 @@ static void multipath_resume(struct dm_target *ti)
 *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
 */
 static int multipath_status(struct dm_target *ti, status_type_t type,
-			    char *result, unsigned int maxlen)
+			    unsigned status_flags, char *result, unsigned maxlen)
 {
 	int sz = 0;
 	unsigned long flags;
@@ -1364,13 +1396,16 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
 	else {
 		DMEMIT("%u ", m->queue_if_no_path +
 			      (m->pg_init_retries > 0) * 2 +
-			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
+			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
+			      m->retain_attached_hw_handler);
 		if (m->queue_if_no_path)
 			DMEMIT("queue_if_no_path ");
 		if (m->pg_init_retries)
 			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
 		if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
 			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
+		if (m->retain_attached_hw_handler)
+			DMEMIT("retain_attached_hw_handler ");
 	}
 
 	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1656,7 +1691,7 @@ out:
 *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,

+ 35 - 21
drivers/md/dm-raid.c

@@ -101,20 +101,12 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 {
 	unsigned i;
 	struct raid_set *rs;
-	sector_t sectors_per_dev;
 
 	if (raid_devs <= raid_type->parity_devs) {
 		ti->error = "Insufficient number of devices";
 		return ERR_PTR(-EINVAL);
 	}
 
-	sectors_per_dev = ti->len;
-	if ((raid_type->level > 1) &&
-	    sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
-		ti->error = "Target length not divisible by number of data devices";
-		return ERR_PTR(-EINVAL);
-	}
-
 	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
 	if (!rs) {
 		ti->error = "Cannot allocate raid context";
@@ -128,7 +120,6 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	rs->md.raid_disks = raid_devs;
 	rs->md.level = raid_type->level;
 	rs->md.new_level = rs->md.level;
-	rs->md.dev_sectors = sectors_per_dev;
 	rs->md.layout = raid_type->algorithm;
 	rs->md.new_layout = rs->md.layout;
 	rs->md.delta_disks = 0;
@@ -143,6 +134,7 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	 *  rs->md.external
 	 *  rs->md.chunk_sectors
 	 *  rs->md.new_chunk_sectors
+	 *  rs->md.dev_sectors
 	 */
 
 	return rs;
@@ -353,6 +345,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 {
 	unsigned i, rebuild_cnt = 0;
 	unsigned long value, region_size = 0;
+	sector_t sectors_per_dev = rs->ti->len;
+	sector_t max_io_len;
 	char *key;
 
 	/*
@@ -429,13 +423,28 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 
 		if (!strcasecmp(key, "rebuild")) {
 			rebuild_cnt++;
-			if (((rs->raid_type->level != 1) &&
-			     (rebuild_cnt > rs->raid_type->parity_devs)) ||
-			    ((rs->raid_type->level == 1) &&
-			     (rebuild_cnt > (rs->md.raid_disks - 1)))) {
-				rs->ti->error = "Too many rebuild devices specified for given RAID type";
+
+			switch (rs->raid_type->level) {
+			case 1:
+				if (rebuild_cnt >= rs->md.raid_disks) {
+					rs->ti->error = "Too many rebuild devices specified";
+					return -EINVAL;
+				}
+				break;
+			case 4:
+			case 5:
+			case 6:
+				if (rebuild_cnt > rs->raid_type->parity_devs) {
+					rs->ti->error = "Too many rebuild devices specified for given RAID type";
+					return -EINVAL;
+				}
+				break;
+			default:
+				DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
+				rs->ti->error = "Rebuild not supported for this RAID type";
 				return -EINVAL;
 			}
+
 			if (value > rs->md.raid_disks) {
 				rs->ti->error = "Invalid rebuild index given";
 				return -EINVAL;
@@ -522,14 +531,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		return -EINVAL;
 
 	if (rs->md.chunk_sectors)
-		rs->ti->split_io = rs->md.chunk_sectors;
+		max_io_len = rs->md.chunk_sectors;
 	else
-		rs->ti->split_io = region_size;
+		max_io_len = region_size;
 
-	if (rs->md.chunk_sectors)
-		rs->ti->split_io = rs->md.chunk_sectors;
-	else
-		rs->ti->split_io = region_size;
+	if (dm_set_target_max_io_len(rs->ti, max_io_len))
+		return -EINVAL;
+
+	if ((rs->raid_type->level > 1) &&
+	    sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) {
+		rs->ti->error = "Target length not divisible by number of data devices";
+		return -EINVAL;
+	}
+	rs->md.dev_sectors = sectors_per_dev;
 
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
@@ -1067,7 +1081,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_c
 }
 
 static int raid_status(struct dm_target *ti, status_type_t type,
-		       char *result, unsigned maxlen)
+		       unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct raid_set *rs = ti->private;
 	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */

+ 7 - 3
drivers/md/dm-raid1.c

@@ -1081,10 +1081,14 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->private = ms;
-	ti->split_io = dm_rh_get_region_size(ms->rh);
+
+	r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh));
+	if (r)
+		goto err_free_context;
+
 	ti->num_flush_requests = 1;
 	ti->num_discard_requests = 1;
-	ti->discard_zeroes_data_unsupported = 1;
+	ti->discard_zeroes_data_unsupported = true;
 
 	ms->kmirrord_wq = alloc_workqueue("kmirrord",
 					  WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
@@ -1363,7 +1367,7 @@ static char device_status_char(struct mirror *m)
 
 
 static int mirror_status(struct dm_target *ti, status_type_t type,
-			 char *result, unsigned int maxlen)
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned int m, sz = 0;
 	struct mirror_set *ms = (struct mirror_set *) ti->private;

+ 18 - 16
drivers/md/dm-snap.c

@@ -691,7 +691,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 * Return a minimum chunk size of all snapshots that have the specified origin.
 * Return zero if the origin has no snapshots.
 */
-static sector_t __minimum_chunk_size(struct origin *o)
+static uint32_t __minimum_chunk_size(struct origin *o)
 {
 	struct dm_snapshot *snap;
 	unsigned chunk_size = 0;
@@ -701,7 +701,7 @@ static sector_t __minimum_chunk_size(struct origin *o)
 			chunk_size = min_not_zero(chunk_size,
 						  snap->store->chunk_size);
 
-	return chunk_size;
+	return (uint32_t) chunk_size;
 }
 
 /*
@@ -1172,7 +1172,10 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ti->error = "Chunk size not set";
 		goto bad_read_metadata;
 	}
-	ti->split_io = s->store->chunk_size;
+
+	r = dm_set_target_max_io_len(ti, s->store->chunk_size);
+	if (r)
+		goto bad_read_metadata;
 
 	return 0;
 
@@ -1239,7 +1242,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src,
 	snap_dest->store->snap = snap_dest;
 	snap_src->store->snap = snap_src;
 
-	snap_dest->ti->split_io = snap_dest->store->chunk_size;
+	snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
 	snap_dest->valid = snap_src->valid;
 
 	/*
@@ -1817,9 +1820,9 @@ static void snapshot_resume(struct dm_target *ti)
 	up_write(&s->lock);
 }
 
-static sector_t get_origin_minimum_chunksize(struct block_device *bdev)
+static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
 {
-	sector_t min_chunksize;
+	uint32_t min_chunksize;
 
 	down_read(&_origins_lock);
 	min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
@@ -1838,15 +1841,15 @@ static void snapshot_merge_resume(struct dm_target *ti)
 	snapshot_resume(ti);
 
 	/*
-	 * snapshot-merge acts as an origin, so set ti->split_io
+	 * snapshot-merge acts as an origin, so set ti->max_io_len
 	 */
-	ti->split_io = get_origin_minimum_chunksize(s->origin->bdev);
+	ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
 
 	start_merge(s);
 }
 
 static int snapshot_status(struct dm_target *ti, status_type_t type,
-			   char *result, unsigned int maxlen)
+			   unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned sz = 0;
 	struct dm_snapshot *snap = ti->private;
@@ -2073,12 +2076,12 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
 	struct origin *o;
 
 	/*
-	 * The origin's __minimum_chunk_size() got stored in split_io
+	 * The origin's __minimum_chunk_size() got stored in max_io_len
 	 * by snapshot_merge_resume().
 	 */
 	down_read(&_origins_lock);
 	o = __lookup_origin(merging_snap->origin->bdev);
-	for (n = 0; n < size; n += merging_snap->ti->split_io)
+	for (n = 0; n < size; n += merging_snap->ti->max_io_len)
 		if (__origin_write(&o->snapshots, sector + n, NULL) ==
 		    DM_MAPIO_SUBMITTED)
 			must_wait = 1;
@@ -2138,18 +2141,18 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
 }
 
 /*
- * Set the target "split_io" field to the minimum of all the snapshots'
+ * Set the target "max_io_len" field to the minimum of all the snapshots'
 * chunk sizes.
 */
 static void origin_resume(struct dm_target *ti)
 {
 	struct dm_dev *dev = ti->private;
 
-	ti->split_io = get_origin_minimum_chunksize(dev->bdev);
+	ti->max_io_len = get_origin_minimum_chunksize(dev->bdev);
 }
 
-static int origin_status(struct dm_target *ti, status_type_t type, char *result,
-			 unsigned int maxlen)
+static int origin_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct dm_dev *dev = ti->private;
 
@@ -2176,7 +2179,6 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 		return max_size;
 
 	bvm->bi_bdev = dev->bdev;
-	bvm->bi_sector = bvm->bi_sector;
 
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }

+ 48 - 39
drivers/md/dm-stripe.c

@@ -26,14 +26,12 @@ struct stripe {
 struct stripe_c {
 	uint32_t stripes;
 	int stripes_shift;
-	sector_t stripes_mask;
 
 	/* The size of this target / num. stripes */
 	sector_t stripe_width;
 
-	/* stripe chunk size */
-	uint32_t chunk_shift;
-	sector_t chunk_mask;
+	uint32_t chunk_size;
+	int chunk_size_shift;
 
 	/* Needed for handling events */
 	struct dm_target *ti;
@@ -91,7 +89,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
 
 /*
 * Construct a striped mapping.
- * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
+ * <number of stripes> <chunk size> [<dev_path> <offset>]+
 */
 static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
@@ -99,7 +97,6 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	sector_t width;
 	uint32_t stripes;
 	uint32_t chunk_size;
-	char *end;
 	int r;
 	unsigned int i;
 
@@ -108,34 +105,23 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -EINVAL;
 	}
 
-	stripes = simple_strtoul(argv[0], &end, 10);
-	if (!stripes || *end) {
+	if (kstrtouint(argv[0], 10, &stripes) || !stripes) {
 		ti->error = "Invalid stripe count";
 		return -EINVAL;
 	}
 
-	chunk_size = simple_strtoul(argv[1], &end, 10);
-	if (*end) {
+	if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) {
 		ti->error = "Invalid chunk_size";
 		return -EINVAL;
 	}
 
-	/*
-	 * chunk_size is a power of two
-	 */
-	if (!is_power_of_2(chunk_size) ||
-	    (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
-		ti->error = "Invalid chunk size";
-		return -EINVAL;
-	}
-
-	if (ti->len & (chunk_size - 1)) {
+	width = ti->len;
+	if (sector_div(width, chunk_size)) {
 		ti->error = "Target length not divisible by "
 		    "chunk size";
 		return -EINVAL;
 	}
 
-	width = ti->len;
 	if (sector_div(width, stripes)) {
 		ti->error = "Target length not divisible by "
 		    "number of stripes";
@@ -167,17 +153,21 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	if (stripes & (stripes - 1))
 		sc->stripes_shift = -1;
-	else {
-		sc->stripes_shift = ffs(stripes) - 1;
-		sc->stripes_mask = ((sector_t) stripes) - 1;
-	}
+	else
+		sc->stripes_shift = __ffs(stripes);
+
+	r = dm_set_target_max_io_len(ti, chunk_size);
+	if (r)
+		return r;
 
-	ti->split_io = chunk_size;
 	ti->num_flush_requests = stripes;
 	ti->num_discard_requests = stripes;
 
-	sc->chunk_shift = ffs(chunk_size) - 1;
-	sc->chunk_mask = ((sector_t) chunk_size) - 1;
+	sc->chunk_size = chunk_size;
+	if (chunk_size & (chunk_size - 1))
+		sc->chunk_size_shift = -1;
+	else
+		sc->chunk_size_shift = __ffs(chunk_size);
 
 	/*
	 * Get the stripe destinations.
@@ -216,17 +206,29 @@ static void stripe_dtr(struct dm_target *ti)
 static void stripe_map_sector(struct stripe_c *sc, sector_t sector,
 			      uint32_t *stripe, sector_t *result)
 {
-	sector_t offset = dm_target_offset(sc->ti, sector);
-	sector_t chunk = offset >> sc->chunk_shift;
+	sector_t chunk = dm_target_offset(sc->ti, sector);
+	sector_t chunk_offset;
+
+	if (sc->chunk_size_shift < 0)
+		chunk_offset = sector_div(chunk, sc->chunk_size);
+	else {
+		chunk_offset = chunk & (sc->chunk_size - 1);
+		chunk >>= sc->chunk_size_shift;
+	}
 
 	if (sc->stripes_shift < 0)
 		*stripe = sector_div(chunk, sc->stripes);
 	else {
-		*stripe = chunk & sc->stripes_mask;
+		*stripe = chunk & (sc->stripes - 1);
 		chunk >>= sc->stripes_shift;
 	}
 
-	*result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask);
+	if (sc->chunk_size_shift < 0)
+		chunk *= sc->chunk_size;
+	else
+		chunk <<= sc->chunk_size_shift;
+
+	*result = chunk + chunk_offset;
 }
 
 static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
@@ -237,9 +239,16 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
 	stripe_map_sector(sc, sector, &stripe, result);
 	if (stripe == target_stripe)
 		return;
-	*result &= ~sc->chunk_mask;			/* round down */
+
+	/* round down */
+	sector = *result;
+	if (sc->chunk_size_shift < 0)
+		*result -= sector_div(sector, sc->chunk_size);
+	else
+		*result = sector & ~(sector_t)(sc->chunk_size - 1);
+
 	if (target_stripe < stripe)
-		*result += sc->chunk_mask + 1;		/* next chunk */
+		*result += sc->chunk_size;		/* next chunk */
 }
 
 static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
@@ -302,8 +311,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
 *
 */
 
-static int stripe_status(struct dm_target *ti,
-			 status_type_t type, char *result, unsigned int maxlen)
+static int stripe_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct stripe_c *sc = (struct stripe_c *) ti->private;
 	char buffer[sc->stripes + 1];
@@ -324,7 +333,7 @@ static int stripe_status(struct dm_target *ti,
 
 	case STATUSTYPE_TABLE:
 		DMEMIT("%d %llu", sc->stripes,
 		DMEMIT("%d %llu", sc->stripes,
-			(unsigned long long)sc->chunk_mask + 1);
+			(unsigned long long)sc->chunk_size);
 		for (i = 0; i < sc->stripes; i++)
 		for (i = 0; i < sc->stripes; i++)
 			DMEMIT(" %s %llu", sc->stripe[i].dev->name,
 			DMEMIT(" %s %llu", sc->stripe[i].dev->name,
 			    (unsigned long long)sc->stripe[i].physical_start);
 			    (unsigned long long)sc->stripe[i].physical_start);
@@ -391,7 +400,7 @@ static void stripe_io_hints(struct dm_target *ti,
 			    struct queue_limits *limits)
 			    struct queue_limits *limits)
 {
 {
 	struct stripe_c *sc = ti->private;
 	struct stripe_c *sc = ti->private;
-	unsigned chunk_size = (sc->chunk_mask + 1) << 9;
+	unsigned chunk_size = sc->chunk_size << SECTOR_SHIFT;
 
 
 	blk_limits_io_min(limits, chunk_size);
 	blk_limits_io_min(limits, chunk_size);
 	blk_limits_io_opt(limits, chunk_size * sc->stripes);
 	blk_limits_io_opt(limits, chunk_size * sc->stripes);
@@ -419,7 +428,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 
 
 static struct target_type stripe_target = {
 static struct target_type stripe_target = {
 	.name   = "striped",
 	.name   = "striped",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.module = THIS_MODULE,
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
 	.dtr    = stripe_dtr,

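The arithmetic rework above is the heart of the non-power-of-two chunk support: the old code could always shift and mask, while the new stripe_map_sector() keeps that fast path and falls back to division when chunk_size_shift is -1. Below is a minimal userspace sketch of the same mapping, not part of the commit; sector64 stands in for the kernel's sector_t, plain '%' and '/' stand in for sector_div(), and the stripes fast path is elided.

/* Illustrative only -- compile with any C compiler and run. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector64;

struct layout {
	uint32_t stripes;
	uint32_t chunk_size;	/* in sectors; any value >= 1 */
	int chunk_size_shift;	/* -1 unless chunk_size is a power of 2 */
};

static void map_sector(const struct layout *l, sector64 offset,
		       uint32_t *stripe, sector64 *result)
{
	sector64 chunk = offset, chunk_offset;

	if (l->chunk_size_shift < 0) {
		chunk_offset = chunk % l->chunk_size;	/* division path */
		chunk /= l->chunk_size;
	} else {
		chunk_offset = chunk & (l->chunk_size - 1);	/* fast path */
		chunk >>= l->chunk_size_shift;
	}

	*stripe = chunk % l->stripes;	/* which device the chunk lands on */
	chunk /= l->stripes;		/* chunk index within that device */

	*result = chunk * l->chunk_size + chunk_offset;
}

int main(void)
{
	/* 3 stripes, 24-sector chunks: 24 is not a power of two. */
	struct layout l = { 3, 24, -1 };
	uint32_t stripe;
	sector64 res;

	map_sector(&l, 100, &stripe, &res);
	printf("sector 100 -> stripe %u, sector %llu\n",
	       stripe, (unsigned long long)res);	/* stripe 1, sector 28 */
	return 0;
}
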
+ 3 - 0
drivers/md/dm-table.c

@@ -1319,6 +1319,9 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
 		if (!ti->num_flush_requests)
 			continue;
 
+		if (ti->flush_supported)
+			return 1;
+
 		if (ti->type->iterate_devices &&
 		    ti->type->iterate_devices(ti, device_flush_capable, &flush))
 			return 1;

+ 461 - 308
drivers/md/dm-thin-metadata.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 Red Hat, Inc.
+ * Copyright (C) 2011-2012 Red Hat, Inc.
  *
  * This file is released under the GPL.
  */
@@ -80,6 +80,12 @@
 #define THIN_METADATA_CACHE_SIZE 64
 #define SECTOR_TO_BLOCK_SHIFT 3
 
+/*
+ *  3 for btree insert +
+ *  2 for btree lookup used within space map
+ */
+#define THIN_MAX_CONCURRENT_LOCKS 5
+
 /* This should be plenty */
 #define SPACE_MAP_ROOT_SIZE 128
 
@@ -172,13 +178,20 @@ struct dm_pool_metadata {
 
 	struct rw_semaphore root_lock;
 	uint32_t time;
-	int need_commit;
 	dm_block_t root;
 	dm_block_t details_root;
 	struct list_head thin_devices;
 	uint64_t trans_id;
 	unsigned long flags;
 	sector_t data_block_size;
+	bool read_only:1;
+
+	/*
+	 * Set if a transaction has to be aborted but the attempt to roll back
+	 * to the previous (good) transaction failed.  The only pool metadata
+	 * operation possible in this state is the closing of the device.
+	 */
+	bool fail_io:1;
 };
 
 struct dm_thin_device {
@@ -187,7 +200,8 @@ struct dm_thin_device {
 	dm_thin_id id;
 
 	int open_count;
-	int changed;
+	bool changed:1;
+	bool aborted_with_changes:1;
 	uint64_t mapped_blocks;
 	uint64_t transaction_id;
 	uint32_t creation_time;
@@ -338,7 +352,21 @@ static int subtree_equal(void *context, void *value1_le, void *value2_le)
 
 /*----------------------------------------------------------------*/
 
-static int superblock_all_zeroes(struct dm_block_manager *bm, int *result)
+static int superblock_lock_zero(struct dm_pool_metadata *pmd,
+				struct dm_block **sblock)
+{
+	return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+				     &sb_validator, sblock);
+}
+
+static int superblock_lock(struct dm_pool_metadata *pmd,
+			   struct dm_block **sblock)
+{
+	return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+				&sb_validator, sblock);
+}
+
+static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
 {
 	int r;
 	unsigned i;
@@ -365,72 +393,9 @@ static int superblock_all_zeroes(struct dm_block_manager *bm, int *result)
 	return dm_bm_unlock(b);
 }
 
-static int init_pmd(struct dm_pool_metadata *pmd,
-		    struct dm_block_manager *bm,
-		    dm_block_t nr_blocks, int create)
+static void __setup_btree_details(struct dm_pool_metadata *pmd)
 {
-	int r;
-	struct dm_space_map *sm, *data_sm;
-	struct dm_transaction_manager *tm;
-	struct dm_block *sblock;
-
-	if (create) {
-		r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
-					 &sb_validator, &tm, &sm, &sblock);
-		if (r < 0) {
-			DMERR("tm_create_with_sm failed");
-			return r;
-		}
-
-		data_sm = dm_sm_disk_create(tm, nr_blocks);
-		if (IS_ERR(data_sm)) {
-			DMERR("sm_disk_create failed");
-			dm_tm_unlock(tm, sblock);
-			r = PTR_ERR(data_sm);
-			goto bad;
-		}
-	} else {
-		struct thin_disk_superblock *disk_super = NULL;
-		size_t space_map_root_offset =
-			offsetof(struct thin_disk_superblock, metadata_space_map_root);
-
-		r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
-				       &sb_validator, space_map_root_offset,
-				       SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock);
-		if (r < 0) {
-			DMERR("tm_open_with_sm failed");
-			return r;
-		}
-
-		disk_super = dm_block_data(sblock);
-		data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root,
-					  sizeof(disk_super->data_space_map_root));
-		if (IS_ERR(data_sm)) {
-			DMERR("sm_disk_open failed");
-			r = PTR_ERR(data_sm);
-			goto bad;
-		}
-	}
-
-
-	r = dm_tm_unlock(tm, sblock);
-	if (r < 0) {
-		DMERR("couldn't unlock superblock");
-		goto bad_data_sm;
-	}
-
-	pmd->bm = bm;
-	pmd->metadata_sm = sm;
-	pmd->data_sm = data_sm;
-	pmd->tm = tm;
-	pmd->nb_tm = dm_tm_create_non_blocking_clone(tm);
-	if (!pmd->nb_tm) {
-		DMERR("could not create clone tm");
-		r = -ENOMEM;
-		goto bad_data_sm;
-	}
-
-	pmd->info.tm = tm;
+	pmd->info.tm = pmd->tm;
 	pmd->info.levels = 2;
 	pmd->info.value_type.context = pmd->data_sm;
 	pmd->info.value_type.size = sizeof(__le64);
@@ -441,7 +406,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
 	memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
 	pmd->nb_info.tm = pmd->nb_tm;
 
-	pmd->tl_info.tm = tm;
+	pmd->tl_info.tm = pmd->tm;
 	pmd->tl_info.levels = 1;
 	pmd->tl_info.value_type.context = &pmd->info;
 	pmd->tl_info.value_type.size = sizeof(__le64);
@@ -449,7 +414,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
 	pmd->tl_info.value_type.dec = subtree_dec;
 	pmd->tl_info.value_type.equal = subtree_equal;
 
-	pmd->bl_info.tm = tm;
+	pmd->bl_info.tm = pmd->tm;
 	pmd->bl_info.levels = 1;
 	pmd->bl_info.value_type.context = pmd->data_sm;
 	pmd->bl_info.value_type.size = sizeof(__le64);
@@ -457,47 +422,265 @@ static int init_pmd(struct dm_pool_metadata *pmd,
 	pmd->bl_info.value_type.dec = data_block_dec;
 	pmd->bl_info.value_type.equal = data_block_equal;
 
-	pmd->details_info.tm = tm;
+	pmd->details_info.tm = pmd->tm;
 	pmd->details_info.levels = 1;
 	pmd->details_info.value_type.context = NULL;
 	pmd->details_info.value_type.size = sizeof(struct disk_device_details);
 	pmd->details_info.value_type.inc = NULL;
 	pmd->details_info.value_type.dec = NULL;
 	pmd->details_info.value_type.equal = NULL;
+}
 
-	pmd->root = 0;
+static int __write_initial_superblock(struct dm_pool_metadata *pmd)
+{
+	int r;
+	struct dm_block *sblock;
+	size_t metadata_len, data_len;
+	struct thin_disk_superblock *disk_super;
+	sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
 
-	init_rwsem(&pmd->root_lock);
-	pmd->time = 0;
-	pmd->need_commit = 0;
-	pmd->details_root = 0;
-	pmd->trans_id = 0;
-	pmd->flags = 0;
-	INIT_LIST_HEAD(&pmd->thin_devices);
+	if (bdev_size > THIN_METADATA_MAX_SECTORS)
+		bdev_size = THIN_METADATA_MAX_SECTORS;
+
+	r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
+	if (r < 0)
+		return r;
+
+	r = dm_sm_root_size(pmd->data_sm, &data_len);
+	if (r < 0)
+		return r;
+
+	r = dm_sm_commit(pmd->data_sm);
+	if (r < 0)
+		return r;
+
+	r = dm_tm_pre_commit(pmd->tm);
+	if (r < 0)
+		return r;
+
+	r = superblock_lock_zero(pmd, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = 0;
+	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
+	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
+	disk_super->version = cpu_to_le32(THIN_VERSION);
+	disk_super->time = 0;
+	disk_super->trans_id = 0;
+	disk_super->held_root = 0;
+
+	r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
+			    metadata_len);
+	if (r < 0)
+		goto bad_locked;
+
+	r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
+			    data_len);
+	if (r < 0)
+		goto bad_locked;
+
+	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
+	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
+	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
+	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
+
+	return dm_tm_commit(pmd->tm, sblock);
+
+bad_locked:
+	dm_bm_unlock(sblock);
+	return r;
+}
+
+static int __format_metadata(struct dm_pool_metadata *pmd)
+{
+	int r;
+
+	r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+				 &pmd->tm, &pmd->metadata_sm);
+	if (r < 0) {
+		DMERR("tm_create_with_sm failed");
+		return r;
+	}
+
+	pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
+	if (IS_ERR(pmd->data_sm)) {
+		DMERR("sm_disk_create failed");
+		r = PTR_ERR(pmd->data_sm);
+		goto bad_cleanup_tm;
+	}
+
+	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
+	if (!pmd->nb_tm) {
+		DMERR("could not create non-blocking clone tm");
+		r = -ENOMEM;
+		goto bad_cleanup_data_sm;
+	}
+
+	__setup_btree_details(pmd);
+
+	r = dm_btree_empty(&pmd->info, &pmd->root);
+	if (r < 0)
+		goto bad_cleanup_nb_tm;
+
+	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
+	if (r < 0) {
+		DMERR("couldn't create devices root");
+		goto bad_cleanup_nb_tm;
+	}
+
+	r = __write_initial_superblock(pmd);
+	if (r)
+		goto bad_cleanup_nb_tm;
 
 	return 0;
 
-bad_data_sm:
-	dm_sm_destroy(data_sm);
-bad:
-	dm_tm_destroy(tm);
-	dm_sm_destroy(sm);
+bad_cleanup_nb_tm:
+	dm_tm_destroy(pmd->nb_tm);
+bad_cleanup_data_sm:
+	dm_sm_destroy(pmd->data_sm);
+bad_cleanup_tm:
+	dm_tm_destroy(pmd->tm);
+	dm_sm_destroy(pmd->metadata_sm);
+
+	return r;
+}
+
+static int __check_incompat_features(struct thin_disk_superblock *disk_super,
+				     struct dm_pool_metadata *pmd)
+{
+	uint32_t features;
+
+	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
+	if (features) {
+		DMERR("could not access metadata due to unsupported optional features (%lx).",
+		      (unsigned long)features);
+		return -EINVAL;
+	}
+
+	/*
+	 * Check for read-only metadata to skip the following RDWR checks.
+	 */
+	if (get_disk_ro(pmd->bdev->bd_disk))
+		return 0;
+
+	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
+	if (features) {
+		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
+		      (unsigned long)features);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int __open_metadata(struct dm_pool_metadata *pmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct thin_disk_superblock *disk_super;
+
+	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+			    &sb_validator, &sblock);
+	if (r < 0) {
+		DMERR("couldn't read superblock");
+		return r;
+	}
+
+	disk_super = dm_block_data(sblock);
+
+	r = __check_incompat_features(disk_super, pmd);
+	if (r < 0)
+		goto bad_unlock_sblock;
+
+	r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+			       disk_super->metadata_space_map_root,
+			       sizeof(disk_super->metadata_space_map_root),
+			       &pmd->tm, &pmd->metadata_sm);
+	if (r < 0) {
+		DMERR("tm_open_with_sm failed");
+		goto bad_unlock_sblock;
+	}
+
+	pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
+				       sizeof(disk_super->data_space_map_root));
+	if (IS_ERR(pmd->data_sm)) {
+		DMERR("sm_disk_open failed");
+		r = PTR_ERR(pmd->data_sm);
+		goto bad_cleanup_tm;
+	}
+
+	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
+	if (!pmd->nb_tm) {
+		DMERR("could not create non-blocking clone tm");
+		r = -ENOMEM;
+		goto bad_cleanup_data_sm;
+	}
+
+	__setup_btree_details(pmd);
+	return dm_bm_unlock(sblock);
+
+bad_cleanup_data_sm:
+	dm_sm_destroy(pmd->data_sm);
+bad_cleanup_tm:
+	dm_tm_destroy(pmd->tm);
+	dm_sm_destroy(pmd->metadata_sm);
+bad_unlock_sblock:
+	dm_bm_unlock(sblock);
+
+	return r;
+}
+
+static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
+{
+	int r, unformatted;
+
+	r = __superblock_all_zeroes(pmd->bm, &unformatted);
+	if (r)
+		return r;
+
+	if (unformatted)
+		return format_device ? __format_metadata(pmd) : -EPERM;
+
+	return __open_metadata(pmd);
+}
+
+static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
+{
+	int r;
+
+	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE,
+					  THIN_METADATA_CACHE_SIZE,
+					  THIN_MAX_CONCURRENT_LOCKS);
+	if (IS_ERR(pmd->bm)) {
+		DMERR("could not create block manager");
+		return PTR_ERR(pmd->bm);
+	}
+
+	r = __open_or_format_metadata(pmd, format_device);
+	if (r)
+		dm_block_manager_destroy(pmd->bm);
 
 	return r;
 }
 
+static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
+{
+	dm_sm_destroy(pmd->data_sm);
+	dm_sm_destroy(pmd->metadata_sm);
+	dm_tm_destroy(pmd->nb_tm);
+	dm_tm_destroy(pmd->tm);
+	dm_block_manager_destroy(pmd->bm);
+}
+
 static int __begin_transaction(struct dm_pool_metadata *pmd)
 {
 	int r;
-	u32 features;
 	struct thin_disk_superblock *disk_super;
 	struct dm_block *sblock;
 
-	/*
-	 * __maybe_commit_transaction() resets these
-	 */
-	WARN_ON(pmd->need_commit);
-
 	/*
 	 * We re-read the superblock every time.  Shouldn't need to do this
 	 * really.
@@ -515,32 +698,8 @@ static int __begin_transaction(struct dm_pool_metadata *pmd)
 	pmd->flags = le32_to_cpu(disk_super->flags);
 	pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
 
-	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
-	if (features) {
-		DMERR("could not access metadata due to "
-		      "unsupported optional features (%lx).",
-		      (unsigned long)features);
-		r = -EINVAL;
-		goto out;
-	}
-
-	/*
-	 * Check for read-only metadata to skip the following RDWR checks.
-	 */
-	if (get_disk_ro(pmd->bdev->bd_disk))
-		goto out;
-
-	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
-	if (features) {
-		DMERR("could not access metadata RDWR due to "
-		      "unsupported optional features (%lx).",
-		      (unsigned long)features);
-		r = -EINVAL;
-	}
-
-out:
 	dm_bm_unlock(sblock);
-	return r;
+	return 0;
 }
 
 static int __write_changed_details(struct dm_pool_metadata *pmd)
@@ -573,8 +732,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
 			list_del(&td->list);
 			kfree(td);
 		}
-
-		pmd->need_commit = 1;
 	}
 
 	return 0;
@@ -582,9 +739,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
 
 static int __commit_transaction(struct dm_pool_metadata *pmd)
 {
-	/*
-	 * FIXME: Associated pool should be made read-only on failure.
-	 */
 	int r;
 	size_t metadata_len, data_len;
 	struct thin_disk_superblock *disk_super;
@@ -597,31 +751,27 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
 
 	r = __write_changed_details(pmd);
 	if (r < 0)
-		goto out;
-
-	if (!pmd->need_commit)
-		goto out;
+		return r;
 
 	r = dm_sm_commit(pmd->data_sm);
 	if (r < 0)
-		goto out;
+		return r;
 
 	r = dm_tm_pre_commit(pmd->tm);
 	if (r < 0)
-		goto out;
+		return r;
 
 	r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
 	if (r < 0)
-		goto out;
+		return r;
 
 	r = dm_sm_root_size(pmd->data_sm, &data_len);
 	if (r < 0)
-		goto out;
+		return r;
 
-	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
-			     &sb_validator, &sblock);
+	r = superblock_lock(pmd, &sblock);
 	if (r)
-		goto out;
+		return r;
 
 	disk_super = dm_block_data(sblock);
 	disk_super->time = cpu_to_le32(pmd->time);
@@ -640,12 +790,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
 	if (r < 0)
 		goto out_locked;
 
-	r = dm_tm_commit(pmd->tm, sblock);
-	if (!r)
-		pmd->need_commit = 0;
-
-out:
-	return r;
+	return dm_tm_commit(pmd->tm, sblock);
 
 out_locked:
 	dm_bm_unlock(sblock);
@@ -653,15 +798,11 @@ out_locked:
 }
 
 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
-					       sector_t data_block_size)
+					       sector_t data_block_size,
+					       bool format_device)
 {
 	int r;
-	struct thin_disk_superblock *disk_super;
 	struct dm_pool_metadata *pmd;
-	sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
-	struct dm_block_manager *bm;
-	int create;
-	struct dm_block *sblock;
 
 	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
 	if (!pmd) {
@@ -669,90 +810,28 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	/*
-	 * Max hex locks:
-	 *  3 for btree insert +
-	 *  2 for btree lookup used within space map
-	 */
-	bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE,
-				     THIN_METADATA_CACHE_SIZE, 5);
-	if (!bm) {
-		DMERR("could not create block manager");
-		kfree(pmd);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	r = superblock_all_zeroes(bm, &create);
-	if (r) {
-		dm_block_manager_destroy(bm);
-		kfree(pmd);
-		return ERR_PTR(r);
-	}
-
+	init_rwsem(&pmd->root_lock);
+	pmd->time = 0;
+	INIT_LIST_HEAD(&pmd->thin_devices);
+	pmd->read_only = false;
+	pmd->fail_io = false;
+	pmd->bdev = bdev;
+	pmd->data_block_size = data_block_size;
 
-	r = init_pmd(pmd, bm, 0, create);
+	r = __create_persistent_data_objects(pmd, format_device);
 	if (r) {
-		dm_block_manager_destroy(bm);
 		kfree(pmd);
 		return ERR_PTR(r);
 	}
-	pmd->bdev = bdev;
-
-	if (!create) {
-		r = __begin_transaction(pmd);
-		if (r < 0)
-			goto bad;
-		return pmd;
-	}
-
-	/*
-	 * Create.
-	 */
-	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
-			     &sb_validator, &sblock);
-	if (r)
-		goto bad;
-
-	if (bdev_size > THIN_METADATA_MAX_SECTORS)
-		bdev_size = THIN_METADATA_MAX_SECTORS;
-
-	disk_super = dm_block_data(sblock);
-	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
-	disk_super->version = cpu_to_le32(THIN_VERSION);
-	disk_super->time = 0;
-	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
-	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
-	disk_super->data_block_size = cpu_to_le32(data_block_size);
-
-	r = dm_bm_unlock(sblock);
-	if (r < 0)
-		goto bad;
-
-	r = dm_btree_empty(&pmd->info, &pmd->root);
-	if (r < 0)
-		goto bad;
-
-	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
-	if (r < 0) {
-		DMERR("couldn't create devices root");
-		goto bad;
-	}
 
-	pmd->flags = 0;
-	pmd->need_commit = 1;
-	r = dm_pool_commit_metadata(pmd);
+	r = __begin_transaction(pmd);
 	if (r < 0) {
-		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-		      __func__, r);
-		goto bad;
+		if (dm_pool_metadata_close(pmd) < 0)
+			DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
+		return ERR_PTR(r);
 	}
 
 	return pmd;
-
-bad:
-	if (dm_pool_metadata_close(pmd) < 0)
-		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
-	return ERR_PTR(r);
 }
 
 int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
@@ -778,18 +857,17 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
 		return -EBUSY;
 	}
 
-	r = __commit_transaction(pmd);
-	if (r < 0)
-		DMWARN("%s: __commit_transaction() failed, error = %d",
-		       __func__, r);
+	if (!pmd->read_only && !pmd->fail_io) {
+		r = __commit_transaction(pmd);
+		if (r < 0)
+			DMWARN("%s: __commit_transaction() failed, error = %d",
+			       __func__, r);
+	}
 
-	dm_tm_destroy(pmd->tm);
-	dm_tm_destroy(pmd->nb_tm);
-	dm_block_manager_destroy(pmd->bm);
-	dm_sm_destroy(pmd->metadata_sm);
-	dm_sm_destroy(pmd->data_sm);
-	kfree(pmd);
+	if (!pmd->fail_io)
+		__destroy_persistent_data_objects(pmd);
 
+	kfree(pmd);
 	return 0;
 }
 
@@ -850,6 +928,7 @@ static int __open_device(struct dm_pool_metadata *pmd,
 	(*td)->id = dev;
 	(*td)->open_count = 1;
 	(*td)->changed = changed;
+	(*td)->aborted_with_changes = false;
 	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
 	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
 	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
@@ -911,10 +990,11 @@ static int __create_thin(struct dm_pool_metadata *pmd,
 
 int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&pmd->root_lock);
-	r = __create_thin(pmd, dev);
+	if (!pmd->fail_io)
+		r = __create_thin(pmd, dev);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1001,10 +1081,11 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd,
 				 dm_thin_id dev,
 				 dm_thin_id origin)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&pmd->root_lock);
-	r = __create_snap(pmd, dev, origin);
+	if (!pmd->fail_io)
+		r = __create_snap(pmd, dev, origin);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1037,18 +1118,17 @@ static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
 	if (r)
 		return r;
 
-	pmd->need_commit = 1;
-
 	return 0;
 }
 
 int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
 			       dm_thin_id dev)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&pmd->root_lock);
-	r = __delete_device(pmd, dev);
+	if (!pmd->fail_io)
+		r = __delete_device(pmd, dev);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1058,28 +1138,40 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
 					uint64_t current_id,
 					uint64_t new_id)
 {
+	int r = -EINVAL;
+
 	down_write(&pmd->root_lock);
+
+	if (pmd->fail_io)
+		goto out;
+
 	if (pmd->trans_id != current_id) {
-		up_write(&pmd->root_lock);
 		DMERR("mismatched transaction id");
-		return -EINVAL;
+		goto out;
 	}
 
 	pmd->trans_id = new_id;
-	pmd->need_commit = 1;
+	r = 0;
+
+out:
 	up_write(&pmd->root_lock);
 
-	return 0;
+	return r;
 }
 
 int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
 					uint64_t *result)
 {
+	int r = -EINVAL;
+
 	down_read(&pmd->root_lock);
-	*result = pmd->trans_id;
+	if (!pmd->fail_io) {
+		*result = pmd->trans_id;
+		r = 0;
+	}
 	up_read(&pmd->root_lock);
 
-	return 0;
+	return r;
}
 
 static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
@@ -1108,8 +1200,6 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
 
 		dm_tm_dec(pmd->tm, held_root);
 		dm_tm_unlock(pmd->tm, copy);
-		pmd->need_commit = 1;
-
 		return -EBUSY;
 	}
 
@@ -1131,29 +1221,25 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
 	/*
 	 * Write the held root into the superblock.
 	 */
-	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
-			     &sb_validator, &sblock);
+	r = superblock_lock(pmd, &sblock);
 	if (r) {
 		dm_tm_dec(pmd->tm, held_root);
-		pmd->need_commit = 1;
 		return r;
 	}
 
 	disk_super = dm_block_data(sblock);
 	disk_super->held_root = cpu_to_le64(held_root);
 	dm_bm_unlock(sblock);
-
-	pmd->need_commit = 1;
-
 	return 0;
 }
 
 int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&pmd->root_lock);
-	r = __reserve_metadata_snap(pmd);
+	if (!pmd->fail_io)
+		r = __reserve_metadata_snap(pmd);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1166,15 +1252,13 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
 	struct dm_block *sblock, *copy;
 	dm_block_t held_root;
 
-	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
-			     &sb_validator, &sblock);
+	r = superblock_lock(pmd, &sblock);
 	if (r)
 		return r;
 
 	disk_super = dm_block_data(sblock);
 	held_root = le64_to_cpu(disk_super->held_root);
 	disk_super->held_root = cpu_to_le64(0);
-	pmd->need_commit = 1;
 
 	dm_bm_unlock(sblock);
 
@@ -1197,10 +1281,11 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
 
 int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&pmd->root_lock);
-	r = __release_metadata_snap(pmd);
+	if (!pmd->fail_io)
+		r = __release_metadata_snap(pmd);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1227,10 +1312,11 @@ static int __get_metadata_snap(struct dm_pool_metadata *pmd,
 int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
 			      dm_block_t *result)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_read(&pmd->root_lock);
-	r = __get_metadata_snap(pmd, result);
+	if (!pmd->fail_io)
+		r = __get_metadata_snap(pmd, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1239,10 +1325,11 @@ int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
 int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
 			     struct dm_thin_device **td)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&pmd->root_lock);
-	r = __open_device(pmd, dev, 0, td);
+	if (!pmd->fail_io)
+		r = __open_device(pmd, dev, 0, td);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1262,7 +1349,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
 	return td->id;
 }
 
-static int __snapshotted_since(struct dm_thin_device *td, uint32_t time)
+static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
 {
 	return td->snapshotted_time > time;
 }
@@ -1270,28 +1357,31 @@ static int __snapshotted_since(struct dm_thin_device *td, uint32_t time)
 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 		       int can_block, struct dm_thin_lookup_result *result)
 {
-	int r;
+	int r = -EINVAL;
 	uint64_t block_time = 0;
 	__le64 value;
 	struct dm_pool_metadata *pmd = td->pmd;
 	dm_block_t keys[2] = { td->id, block };
+	struct dm_btree_info *info;
 
 	if (can_block) {
 		down_read(&pmd->root_lock);
-		r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value);
-		if (!r)
-			block_time = le64_to_cpu(value);
-		up_read(&pmd->root_lock);
-
-	} else if (down_read_trylock(&pmd->root_lock)) {
-		r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value);
-		if (!r)
-			block_time = le64_to_cpu(value);
-		up_read(&pmd->root_lock);
-
-	} else
+		info = &pmd->info;
+	} else if (down_read_trylock(&pmd->root_lock))
+		info = &pmd->nb_info;
+	else
 		return -EWOULDBLOCK;
 
+	if (pmd->fail_io)
+		goto out;
+
+	r = dm_btree_lookup(info, pmd->root, keys, &value);
+	if (!r)
+		block_time = le64_to_cpu(value);
+
+out:
+	up_read(&pmd->root_lock);
+
 	if (!r) {
 		dm_block_t exception_block;
 		uint32_t exception_time;
@@ -1312,7 +1402,6 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
 	struct dm_pool_metadata *pmd = td->pmd;
 	dm_block_t keys[2] = { td->id, block };
 
-	pmd->need_commit = 1;
 	value = cpu_to_le64(pack_block_time(data_block, pmd->time));
 	__dm_bless_for_disk(&value);
 
@@ -1321,10 +1410,9 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
 	if (r)
 		return r;
 
-	if (inserted) {
+	td->changed = 1;
+	if (inserted)
 		td->mapped_blocks++;
-		td->changed = 1;
-	}
 
 	return 0;
 }
@@ -1332,10 +1420,11 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
 int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
 			 dm_block_t data_block)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&td->pmd->root_lock);
-	r = __insert(td, block, data_block);
+	if (!td->pmd->fail_io)
+		r = __insert(td, block, data_block);
 	up_write(&td->pmd->root_lock);
 
 	return r;
@@ -1353,31 +1442,51 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
 
 	td->mapped_blocks--;
 	td->changed = 1;
-	pmd->need_commit = 1;
 
 	return 0;
 }
 
 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&td->pmd->root_lock);
-	r = __remove(td, block);
+	if (!td->pmd->fail_io)
+		r = __remove(td, block);
 	up_write(&td->pmd->root_lock);
 
 	return r;
 }
 
-int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
+bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 {
 	int r;
 
-	down_write(&pmd->root_lock);
+	down_read(&td->pmd->root_lock);
+	r = td->changed;
+	up_read(&td->pmd->root_lock);
 
-	r = dm_sm_new_block(pmd->data_sm, result);
-	pmd->need_commit = 1;
+	return r;
+}
+
+bool dm_thin_aborted_changes(struct dm_thin_device *td)
+{
+	bool r;
 
+	down_read(&td->pmd->root_lock);
+	r = td->aborted_with_changes;
+	up_read(&td->pmd->root_lock);
+
+	return r;
+}
+
+int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
+{
+	int r = -EINVAL;
+
+	down_write(&pmd->root_lock);
+	if (!pmd->fail_io)
+		r = dm_sm_new_block(pmd->data_sm, result);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1385,9 +1494,11 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
 
 int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&pmd->root_lock);
+	if (pmd->fail_io)
+		goto out;
 
 	r = __commit_transaction(pmd);
 	if (r <= 0)
@@ -1402,12 +1513,41 @@ out:
 	return r;
 }
 
+static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
+{
+	struct dm_thin_device *td;
+
+	list_for_each_entry(td, &pmd->thin_devices, list)
+		td->aborted_with_changes = td->changed;
+}
+
+int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
+{
+	int r = -EINVAL;
+
+	down_write(&pmd->root_lock);
+	if (pmd->fail_io)
+		goto out;
+
+	__set_abort_with_changes_flags(pmd);
+	__destroy_persistent_data_objects(pmd);
+	r = __create_persistent_data_objects(pmd, false);
+	if (r)
+		pmd->fail_io = true;
+
+out:
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
 int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_read(&pmd->root_lock);
-	r = dm_sm_get_nr_free(pmd->data_sm, result);
+	if (!pmd->fail_io)
+		r = dm_sm_get_nr_free(pmd->data_sm, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1416,10 +1556,11 @@ int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *resul
 int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
 					  dm_block_t *result)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_read(&pmd->root_lock);
-	r = dm_sm_get_nr_free(pmd->metadata_sm, result);
+	if (!pmd->fail_io)
+		r = dm_sm_get_nr_free(pmd->metadata_sm, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1428,10 +1569,11 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
 int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
 				  dm_block_t *result)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_read(&pmd->root_lock);
-	r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
+	if (!pmd->fail_io)
+		r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1448,10 +1590,11 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
 
 int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_read(&pmd->root_lock);
-	r = dm_sm_get_nr_blocks(pmd->data_sm, result);
+	if (!pmd->fail_io)
+		r = dm_sm_get_nr_blocks(pmd->data_sm, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1459,13 +1602,17 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
 
 int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
 {
+	int r = -EINVAL;
 	struct dm_pool_metadata *pmd = td->pmd;
 
 	down_read(&pmd->root_lock);
-	*result = td->mapped_blocks;
+	if (!pmd->fail_io) {
+		*result = td->mapped_blocks;
+		r = 0;
+	}
 	up_read(&pmd->root_lock);
 
-	return 0;
+	return r;
 }
 
 static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
@@ -1487,11 +1634,12 @@ static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
 				     dm_block_t *result)
 {
-	int r;
+	int r = -EINVAL;
 	struct dm_pool_metadata *pmd = td->pmd;
 
 	down_read(&pmd->root_lock);
-	r = __highest_block(td, result);
+	if (!pmd->fail_io)
+		r = __highest_block(td, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1514,20 +1662,25 @@ static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
 		return -EINVAL;
 	}
 
-	r = dm_sm_extend(pmd->data_sm, new_count - old_count);
-	if (!r)
-		pmd->need_commit = 1;
-
-	return r;
+	return dm_sm_extend(pmd->data_sm, new_count - old_count);
 }
 
 int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
 {
-	int r;
+	int r = -EINVAL;
 
 	down_write(&pmd->root_lock);
-	r = __resize_data_dev(pmd, new_count);
+	if (!pmd->fail_io)
+		r = __resize_data_dev(pmd, new_count);
 	up_write(&pmd->root_lock);
 
 	return r;
 }
+
+void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
+{
+	down_write(&pmd->root_lock);
+	pmd->read_only = true;
+	dm_bm_set_read_only(pmd->bm);
+	up_write(&pmd->root_lock);
+}

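Nearly every exported function in the file above now follows the same shape: take root_lock, refuse with -EINVAL once fail_io is set, otherwise delegate to the locked __helper. A compilable userspace sketch of that guard pattern, with a pthread mutex standing in for the kernel's rw_semaphore and illustrative names throughout:

/* Illustrative only -- not kernel code. */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct pmd_model {
	pthread_mutex_t lock;		/* stands in for root_lock */
	bool fail_io;
	unsigned long long trans_id;
};

static int get_trans_id(struct pmd_model *pmd, unsigned long long *result)
{
	int r = -EINVAL;		/* default once metadata is unusable */

	pthread_mutex_lock(&pmd->lock);
	if (!pmd->fail_io) {		/* touch state only while healthy */
		*result = pmd->trans_id;
		r = 0;
	}
	pthread_mutex_unlock(&pmd->lock);

	return r;
}

int main(void)
{
	struct pmd_model pmd = { PTHREAD_MUTEX_INITIALIZER, false, 42 };
	unsigned long long id = 0;
	int r = get_trans_id(&pmd, &id);

	printf("healthy: r=%d id=%llu\n", r, id);	/* r=0 id=42 */
	pmd.fail_io = true;
	r = get_trans_id(&pmd, &id);
	printf("failed:  r=%d\n", r);			/* r=-22 */
	return 0;
}
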
+ 23 - 2
drivers/md/dm-thin-metadata.h

@@ -38,7 +38,8 @@ typedef uint64_t dm_thin_id;
 * Reopens or creates a new, empty metadata volume.
 */
 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
-					       sector_t data_block_size);
+					       sector_t data_block_size,
+					       bool format_device);
 
 int dm_pool_metadata_close(struct dm_pool_metadata *pmd);
 
@@ -78,6 +79,16 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
 */
 int dm_pool_commit_metadata(struct dm_pool_metadata *pmd);
 
+/*
+ * Discards all uncommitted changes.  Rereads the superblock, rolling back
+ * to the last good transaction.  Thin devices remain open.
+ * dm_thin_aborted_changes() tells you if they had uncommitted changes.
+ *
+ * If this call fails it's only useful to call dm_pool_metadata_close().
+ * All other methods will fail with -EINVAL.
+ */
+int dm_pool_abort_metadata(struct dm_pool_metadata *pmd);
+
 /*
 * Set/get userspace transaction id.
 */
@@ -119,7 +130,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
 
 struct dm_thin_lookup_result {
 	dm_block_t block;
-	int shared;
+	unsigned shared:1;
 };
 
 /*
@@ -147,6 +158,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
 /*
 * Queries.
 */
+bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
+
+bool dm_thin_aborted_changes(struct dm_thin_device *td);
+
 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
 				     dm_block_t *highest_mapped);
 
@@ -171,6 +186,12 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
 */
 int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
 
+/*
+ * Flicks the underlying block manager into read only mode, so you know
+ * that nothing is changing.
+ */
+void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+
 /*----------------------------------------------------------------*/
 
 #endif

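A hedged sketch of how a caller of this header might react to a metadata failure, built only from the functions declared above; this is not the pool target's actual policy (that lives in dm-thin.c and is more involved):

/* Illustrative only: error handling simplified. */
static void handle_metadata_failure(struct dm_pool_metadata *pmd)
{
	if (dm_pool_abort_metadata(pmd)) {
		/*
		 * Rollback itself failed: per the comment above, every
		 * method except dm_pool_metadata_close() now returns
		 * -EINVAL, so tear the metadata down.
		 */
		if (dm_pool_metadata_close(pmd) < 0)
			DMWARN("metadata close failed");
		return;
	}

	/*
	 * Rolled back to the last good transaction; freeze further
	 * changes while the situation is assessed.
	 */
	dm_pool_metadata_read_only(pmd);
}
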
+ 405 - 137
drivers/md/dm-thin.c

@@ -1,10 +1,11 @@
 /*
- * Copyright (C) 2011 Red Hat UK.
+ * Copyright (C) 2011-2012 Red Hat UK.
  *
  * This file is released under the GPL.
  */
 
 #include "dm-thin-metadata.h"
+#include "dm.h"
 
 #include <linux/device-mapper.h>
 #include <linux/dm-io.h>
@@ -19,7 +20,7 @@
 /*
  * Tunable constants
  */
-#define ENDIO_HOOK_POOL_SIZE 10240
+#define ENDIO_HOOK_POOL_SIZE 1024
 #define DEFERRED_SET_SIZE 64
 #define MAPPING_POOL_SIZE 1024
 #define PRISON_CELLS 1024
@@ -496,12 +497,27 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 */
 struct dm_thin_new_mapping;
 
+/*
+ * The pool runs in 3 modes.  Ordered in degraded order for comparisons.
+ */
+enum pool_mode {
+	PM_WRITE,		/* metadata may be changed */
+	PM_READ_ONLY,		/* metadata may not be changed */
+	PM_FAIL,		/* all I/O fails */
+};
+
 struct pool_features {
+	enum pool_mode mode;
+
 	unsigned zero_new_blocks:1;
 	unsigned discard_enabled:1;
 	unsigned discard_passdown:1;
 };
 
+struct thin_c;
+typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
+typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
+
 struct pool {
 	struct list_head list;
 	struct dm_target *ti;	/* Only set if a pool target is bound */
@@ -510,10 +526,9 @@ struct pool {
 	struct block_device *md_dev;
 	struct dm_pool_metadata *pmd;
 
-	uint32_t sectors_per_block;
-	unsigned block_shift;
-	dm_block_t offset_mask;
 	dm_block_t low_water_blocks;
+	uint32_t sectors_per_block;
+	int sectors_per_block_shift;
 
 	struct pool_features pf;
 	unsigned low_water_triggered:1;	/* A dm event has been sent */
@@ -526,8 +541,8 @@ struct pool {
 	struct work_struct worker;
 	struct delayed_work waker;
 
-	unsigned ref_count;
 	unsigned long last_commit_jiffies;
+	unsigned ref_count;
 
 	spinlock_t lock;
 	struct bio_list deferred_bios;
@@ -543,8 +558,17 @@ struct pool {
 	struct dm_thin_new_mapping *next_mapping;
 	mempool_t *mapping_pool;
 	mempool_t *endio_hook_pool;
+
+	process_bio_fn process_bio;
+	process_bio_fn process_discard;
+
+	process_mapping_fn process_prepared_mapping;
+	process_mapping_fn process_prepared_discard;
 };
 
+static enum pool_mode get_pool_mode(struct pool *pool);
+static void set_pool_mode(struct pool *pool, enum pool_mode mode);
+
 /*
 * Target context for a pool.
 */
@@ -679,16 +703,28 @@ static void requeue_io(struct thin_c *tc)
 
 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 {
-	return bio->bi_sector >> tc->pool->block_shift;
+	sector_t block_nr = bio->bi_sector;
+
+	if (tc->pool->sectors_per_block_shift < 0)
+		(void) sector_div(block_nr, tc->pool->sectors_per_block);
+	else
+		block_nr >>= tc->pool->sectors_per_block_shift;
+
+	return block_nr;
 }
 
 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 {
 	struct pool *pool = tc->pool;
+	sector_t bi_sector = bio->bi_sector;
 
 	bio->bi_bdev = tc->pool_dev->bdev;
-	bio->bi_sector = (block << pool->block_shift) +
-		(bio->bi_sector & pool->offset_mask);
+	if (tc->pool->sectors_per_block_shift < 0)
+		bio->bi_sector = (block * pool->sectors_per_block) +
+				 sector_div(bi_sector, pool->sectors_per_block);
+	else
+		bio->bi_sector = (block << pool->sectors_per_block_shift) |
+				(bi_sector & (pool->sectors_per_block - 1));
 }
 
 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
@@ -696,21 +732,39 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 	bio->bi_bdev = tc->origin_dev->bdev;
 }
 
+static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
+{
+	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
+		dm_thin_changed_this_transaction(tc->td);
+}
+
 static void issue(struct thin_c *tc, struct bio *bio)
 {
 	struct pool *pool = tc->pool;
 	unsigned long flags;
 
+	if (!bio_triggers_commit(tc, bio)) {
+		generic_make_request(bio);
+		return;
+	}
+
 	/*
-	 * Batch together any FUA/FLUSH bios we find and then issue
-	 * a single commit for them in process_deferred_bios().
+	 * Complete bio with an error if earlier I/O caused changes to
+	 * the metadata that can't be committed e.g, due to I/O errors
+	 * on the metadata device.
 	 */
-	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
-		spin_lock_irqsave(&pool->lock, flags);
-		bio_list_add(&pool->deferred_flush_bios, bio);
-		spin_unlock_irqrestore(&pool->lock, flags);
-	} else
-		generic_make_request(bio);
+	if (dm_thin_aborted_changes(tc->td)) {
+		bio_io_error(bio);
+		return;
+	}
+
+	/*
+	 * Batch together any bios that trigger commits and then issue a
+	 * single commit for them in process_deferred_bios().
+	 */
+	spin_lock_irqsave(&pool->lock, flags);
+	bio_list_add(&pool->deferred_flush_bios, bio);
+	spin_unlock_irqrestore(&pool->lock, flags);
 }
 
 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
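The rewritten issue() above narrows when a commit is forced: a bio only triggers one if it carries FLUSH/FUA semantics and this device actually changed metadata in the current transaction. A tiny userspace model of that decision (names illustrative, not kernel API):

/* Illustrative only. */
#include <stdbool.h>
#include <stdio.h>

struct fake_bio { bool flush_or_fua; };
struct fake_thin { bool changed_this_transaction; };

static bool triggers_commit(const struct fake_thin *td,
			    const struct fake_bio *bio)
{
	/* Both conditions must hold, mirroring bio_triggers_commit(). */
	return bio->flush_or_fua && td->changed_this_transaction;
}

int main(void)
{
	struct fake_thin td = { .changed_this_transaction = true };
	struct fake_bio plain = { false }, flush = { true };

	printf("plain write commits? %d\n", triggers_commit(&td, &plain)); /* 0 */
	printf("flush commits?       %d\n", triggers_commit(&td, &flush)); /* 1 */
	return 0;
}
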
@@ -847,6 +901,14 @@ static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell
 	wake_worker(pool);
 	wake_worker(pool);
 }
 }
 
 
+static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
+{
+	if (m->bio)
+		m->bio->bi_end_io = m->saved_bi_end_io;
+	cell_error(m->cell);
+	list_del(&m->list);
+	mempool_free(m, m->tc->pool->mapping_pool);
+}
 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
 {
 	struct thin_c *tc = m->tc;
 	struct thin_c *tc = m->tc;
@@ -859,7 +921,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 
 
 	if (m->err) {
 	if (m->err) {
 		cell_error(m->cell);
 		cell_error(m->cell);
-		return;
+		goto out;
 	}
 	}
 
 
 	/*
 	/*
@@ -871,7 +933,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	if (r) {
 	if (r) {
 		DMERR("dm_thin_insert_block() failed");
 		DMERR("dm_thin_insert_block() failed");
 		cell_error(m->cell);
 		cell_error(m->cell);
-		return;
+		goto out;
 	}
 	}
 
 
 	/*
 	/*
@@ -886,22 +948,25 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	} else
 	} else
 		cell_defer(tc, m->cell, m->data_block);
 		cell_defer(tc, m->cell, m->data_block);
 
 
+out:
 	list_del(&m->list);
 	list_del(&m->list);
 	mempool_free(m, tc->pool->mapping_pool);
 	mempool_free(m, tc->pool->mapping_pool);
 }
 }
 
 
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 {
 {
-	int r;
 	struct thin_c *tc = m->tc;
 	struct thin_c *tc = m->tc;
 
 
-	r = dm_thin_remove_block(tc->td, m->virt_block);
-	if (r)
-		DMERR("dm_thin_remove_block() failed");
+	bio_io_error(m->bio);
+	cell_defer_except(tc, m->cell);
+	cell_defer_except(tc, m->cell2);
+	mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+{
+	struct thin_c *tc = m->tc;
 
 
-	/*
-	 * Pass the discard down to the underlying device?
-	 */
 	if (m->pass_discard)
 	if (m->pass_discard)
 		remap_and_issue(tc, m->bio, m->data_block);
 		remap_and_issue(tc, m->bio, m->data_block);
 	else
 	else
@@ -912,8 +977,20 @@ static void process_prepared_discard(struct dm_thin_new_mapping *m)
 	mempool_free(m, tc->pool->mapping_pool);
 	mempool_free(m, tc->pool->mapping_pool);
 }
 }
 
 
+static void process_prepared_discard(struct dm_thin_new_mapping *m)
+{
+	int r;
+	struct thin_c *tc = m->tc;
+
+	r = dm_thin_remove_block(tc->td, m->virt_block);
+	if (r)
+		DMERR("dm_thin_remove_block() failed");
+
+	process_prepared_discard_passdown(m);
+}
+
 static void process_prepared(struct pool *pool, struct list_head *head,
-			     void (*fn)(struct dm_thin_new_mapping *))
+			     process_mapping_fn *fn)
 {
 	unsigned long flags;
 	struct list_head maps;
@@ -925,7 +1002,7 @@ static void process_prepared(struct pool *pool, struct list_head *head,
 	spin_unlock_irqrestore(&pool->lock, flags);
 
 	list_for_each_entry_safe(m, tmp, &maps, list)
-		fn(m);
+		(*fn)(m);
 }
 
 /*
@@ -933,9 +1010,7 @@ static void process_prepared(struct pool *pool, struct list_head *head,
  */
 static int io_overlaps_block(struct pool *pool, struct bio *bio)
 {
-	return !(bio->bi_sector & pool->offset_mask) &&
-		(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
-
+	return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
 }
 
 static int io_overwrites_block(struct pool *pool, struct bio *bio)
@@ -1093,6 +1168,35 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	}
 }
 
+static int commit(struct pool *pool)
+{
+	int r;
+
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r)
+		DMERR("commit failed, error = %d", r);
+
+	return r;
+}
+
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ * Many callers don't care about the return value.
+ */
+static int commit_or_fallback(struct pool *pool)
+{
+	int r;
+
+	if (get_pool_mode(pool) != PM_WRITE)
+		return -EINVAL;
+
+	r = commit(pool);
+	if (r)
+		set_pool_mode(pool, PM_READ_ONLY);
+
+	return r;
+}
+
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
 	int r;
@@ -1121,12 +1225,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 			 * Try to commit to see if that will free up some
 			 * more space.
 			 */
-			r = dm_pool_commit_metadata(pool->pmd);
-			if (r) {
-				DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-				      __func__, r);
-				return r;
-			}
+			(void) commit_or_fallback(pool);
 
 			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 			if (r)
@@ -1218,7 +1317,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 			 */
 			m = get_next_mapping(pool);
 			m->tc = tc;
-			m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
+			m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
 			m->virt_block = block;
 			m->data_block = lookup_result.block;
 			m->cell = cell;
@@ -1234,15 +1333,10 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 			}
 		} else {
 			/*
-			 * This path is hit if people are ignoring
-			 * limits->discard_granularity.  It ignores any
-			 * part of the discard that is in a subsequent
-			 * block.
+			 * The DM core makes sure that the discard doesn't span
+			 * a block boundary.  So we submit the discard of a
+			 * partial block appropriately.
 			 */
-			sector_t offset = bio->bi_sector - (block << pool->block_shift);
-			unsigned remaining = (pool->sectors_per_block - offset) << 9;
-			bio->bi_size = min(bio->bi_size, remaining);
-
 			cell_release_singleton(cell, bio);
 			cell_release_singleton(cell2, bio);
 			if ((!lookup_result.shared) && pool->pf.discard_passdown)
@@ -1310,7 +1404,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 	if (bio_detain(pool->prison, &key, bio, &cell))
 		return;
 
-	if (bio_data_dir(bio) == WRITE)
+	if (bio_data_dir(bio) == WRITE && bio->bi_size)
 		break_sharing(tc, bio, block, &key, lookup_result, cell);
 	else {
 		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
@@ -1362,6 +1456,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 
 	default:
 		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
+		set_pool_mode(tc->pool, PM_READ_ONLY);
 		cell_error(cell);
 		break;
 	}
@@ -1419,6 +1514,49 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 	}
 }
 
+static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+{
+	int r;
+	int rw = bio_data_dir(bio);
+	dm_block_t block = get_bio_block(tc, bio);
+	struct dm_thin_lookup_result lookup_result;
+
+	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+	switch (r) {
+	case 0:
+		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
+			bio_io_error(bio);
+		else
+			remap_and_issue(tc, bio, lookup_result.block);
+		break;
+
+	case -ENODATA:
+		if (rw != READ) {
+			bio_io_error(bio);
+			break;
+		}
+
+		if (tc->origin_dev) {
+			remap_to_origin_and_issue(tc, bio);
+			break;
+		}
+
+		zero_fill_bio(bio);
+		bio_endio(bio, 0);
+		break;
+
+	default:
+		DMERR("dm_thin_find_block() failed, error = %d", r);
+		bio_io_error(bio);
+		break;
+	}
+}
+
+static void process_bio_fail(struct thin_c *tc, struct bio *bio)
+{
+	bio_io_error(bio);
+}
+
 static int need_commit_due_to_time(struct pool *pool)
 {
 	return jiffies < pool->last_commit_jiffies ||
@@ -1430,7 +1568,6 @@ static void process_deferred_bios(struct pool *pool)
 	unsigned long flags;
 	struct bio *bio;
 	struct bio_list bios;
-	int r;
 
 	bio_list_init(&bios);
 
@@ -1457,9 +1594,9 @@ static void process_deferred_bios(struct pool *pool)
 		}
 
 		if (bio->bi_rw & REQ_DISCARD)
-			process_discard(tc, bio);
+			pool->process_discard(tc, bio);
 		else
-			process_bio(tc, bio);
+			pool->process_bio(tc, bio);
 	}
 
 	/*
@@ -1475,10 +1612,7 @@ static void process_deferred_bios(struct pool *pool)
 	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
 		return;
 
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r) {
-		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-		      __func__, r);
+	if (commit_or_fallback(pool)) {
 		while ((bio = bio_list_pop(&bios)))
 			bio_io_error(bio);
 		return;
@@ -1493,8 +1627,8 @@ static void do_worker(struct work_struct *ws)
 {
 	struct pool *pool = container_of(ws, struct pool, worker);
 
-	process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
-	process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
+	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
+	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
 	process_deferred_bios(pool);
 }
 
@@ -1511,6 +1645,52 @@ static void do_waker(struct work_struct *ws)
 
 /*----------------------------------------------------------------*/
 
+static enum pool_mode get_pool_mode(struct pool *pool)
+{
+	return pool->pf.mode;
+}
+
+static void set_pool_mode(struct pool *pool, enum pool_mode mode)
+{
+	int r;
+
+	pool->pf.mode = mode;
+
+	switch (mode) {
+	case PM_FAIL:
+		DMERR("switching pool to failure mode");
+		pool->process_bio = process_bio_fail;
+		pool->process_discard = process_bio_fail;
+		pool->process_prepared_mapping = process_prepared_mapping_fail;
+		pool->process_prepared_discard = process_prepared_discard_fail;
+		break;
+
+	case PM_READ_ONLY:
+		DMERR("switching pool to read-only mode");
+		r = dm_pool_abort_metadata(pool->pmd);
+		if (r) {
+			DMERR("aborting transaction failed");
+			set_pool_mode(pool, PM_FAIL);
+		} else {
+			dm_pool_metadata_read_only(pool->pmd);
+			pool->process_bio = process_bio_read_only;
+			pool->process_discard = process_discard;
+			pool->process_prepared_mapping = process_prepared_mapping_fail;
+			pool->process_prepared_discard = process_prepared_discard_passdown;
+		}
+		break;
+
+	case PM_WRITE:
+		pool->process_bio = process_bio;
+		pool->process_discard = process_discard;
+		pool->process_prepared_mapping = process_prepared_mapping;
+		pool->process_prepared_discard = process_prepared_discard;
+		break;
+	}
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * Mapping functions.
  */
@@ -1556,6 +1736,12 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 	struct dm_thin_lookup_result result;
 
 	map_context->ptr = thin_hook_bio(tc, bio);
+
+	if (get_pool_mode(tc->pool) == PM_FAIL) {
+		bio_io_error(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
 	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
 		thin_defer_bio(tc, bio);
 		return DM_MAPIO_SUBMITTED;
@@ -1592,14 +1778,35 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 		break;
 
 	case -ENODATA:
+		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
+			/*
+			 * This block isn't provisioned, and we have no way
+			 * of doing so.  Just error it.
+			 */
+			bio_io_error(bio);
+			r = DM_MAPIO_SUBMITTED;
+			break;
+		}
+		/* fall through */
+
+	case -EWOULDBLOCK:
 		/*
 		 * In future, the failed dm_thin_find_block above could
 		 * provide the hint to load the metadata into cache.
 		 */
-	case -EWOULDBLOCK:
 		thin_defer_bio(tc, bio);
 		r = DM_MAPIO_SUBMITTED;
 		break;
+
+	default:
+		/*
+		 * Must always call bio_io_error on failure.
+		 * dm_thin_find_block can fail with -EINVAL if the
+		 * pool is switched to fail-io mode.
+		 */
+		bio_io_error(bio);
+		r = DM_MAPIO_SUBMITTED;
+		break;
 	}
 
 	return r;
@@ -1636,15 +1843,26 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 {
 	struct pool_c *pt = ti->private;
 
+	/*
+	 * We want to make sure that degraded pools are never upgraded.
+	 */
+	enum pool_mode old_mode = pool->pf.mode;
+	enum pool_mode new_mode = pt->pf.mode;
+
+	if (old_mode > new_mode)
+		new_mode = old_mode;
+
 	pool->ti = ti;
 	pool->low_water_blocks = pt->low_water_blocks;
 	pool->pf = pt->pf;
+	set_pool_mode(pool, new_mode);
 
 	/*
 	 * If discard_passdown was enabled verify that the data device
 	 * supports discards.  Disable discard_passdown if not; otherwise
 	 * -EOPNOTSUPP will be returned.
 	 */
+	/* FIXME: pull this out into a sep fn. */
 	if (pt->pf.discard_passdown) {
 		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
 		if (!q || !blk_queue_discard(q)) {
@@ -1670,6 +1888,7 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
 /* Initialize pool features. */
 static void pool_features_init(struct pool_features *pf)
 {
+	pf->mode = PM_WRITE;
 	pf->zero_new_blocks = 1;
 	pf->discard_enabled = 1;
 	pf->discard_passdown = 1;
@@ -1700,14 +1919,16 @@ static struct kmem_cache *_endio_hook_cache;
 
 static struct pool *pool_create(struct mapped_device *pool_md,
 				struct block_device *metadata_dev,
-				unsigned long block_size, char **error)
+				unsigned long block_size,
+				int read_only, char **error)
 {
 	int r;
 	void *err_p;
 	struct pool *pool;
 	struct dm_pool_metadata *pmd;
+	bool format_device = read_only ? false : true;
 
-	pmd = dm_pool_metadata_open(metadata_dev, block_size);
+	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
 	if (IS_ERR(pmd)) {
 		*error = "Error creating metadata object";
 		return (struct pool *)pmd;
@@ -1722,8 +1943,10 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 
 	pool->pmd = pmd;
 	pool->sectors_per_block = block_size;
-	pool->block_shift = ffs(block_size) - 1;
-	pool->offset_mask = block_size - 1;
+	if (block_size & (block_size - 1))
+		pool->sectors_per_block_shift = -1;
+	else
+		pool->sectors_per_block_shift = __ffs(block_size);
 	pool->low_water_blocks = 0;
 	pool_features_init(&pool->pf);
 	pool->prison = prison_create(PRISON_CELLS);
@@ -1822,25 +2045,29 @@ static void __pool_dec(struct pool *pool)
 
 static struct pool *__pool_find(struct mapped_device *pool_md,
 				struct block_device *metadata_dev,
-				unsigned long block_size, char **error,
-				int *created)
+				unsigned long block_size, int read_only,
+				char **error, int *created)
 {
 	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
 
 	if (pool) {
-		if (pool->pool_md != pool_md)
+		if (pool->pool_md != pool_md) {
+			*error = "metadata device already in use by a pool";
 			return ERR_PTR(-EBUSY);
+		}
 		__pool_inc(pool);
 
 	} else {
 		pool = __pool_table_lookup(pool_md);
 		if (pool) {
-			if (pool->md_dev != metadata_dev)
+			if (pool->md_dev != metadata_dev) {
+				*error = "different pool cannot replace a pool";
 				return ERR_PTR(-EINVAL);
+			}
 			__pool_inc(pool);
 
 		} else {
-			pool = pool_create(pool_md, metadata_dev, block_size, error);
+			pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
 			*created = 1;
 		}
 	}
@@ -1891,19 +2118,23 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
 		arg_name = dm_shift_arg(as);
 		argc--;
 
-		if (!strcasecmp(arg_name, "skip_block_zeroing")) {
+		if (!strcasecmp(arg_name, "skip_block_zeroing"))
 			pf->zero_new_blocks = 0;
-			continue;
-		} else if (!strcasecmp(arg_name, "ignore_discard")) {
+
+		else if (!strcasecmp(arg_name, "ignore_discard"))
 			pf->discard_enabled = 0;
-			continue;
-		} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
+
+		else if (!strcasecmp(arg_name, "no_discard_passdown"))
 			pf->discard_passdown = 0;
-			continue;
-		}
 
-		ti->error = "Unrecognised pool feature requested";
-		r = -EINVAL;
+		else if (!strcasecmp(arg_name, "read_only"))
+			pf->mode = PM_READ_ONLY;
+
+		else {
+			ti->error = "Unrecognised pool feature requested";
+			r = -EINVAL;
+			break;
+		}
 	}
 
 	return r;
@@ -1967,7 +2198,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
 	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
 	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
-	    !is_power_of_2(block_size)) {
+	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
 		ti->error = "Invalid block size";
 		r = -EINVAL;
 		goto out;
@@ -1996,7 +2227,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
-			   block_size, &ti->error, &pool_created);
+			   block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
 	if (IS_ERR(pool)) {
 		r = PTR_ERR(pool);
 		goto out_free_pt;
@@ -2014,6 +2245,15 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto out_flags_changed;
 	}
 
+	/*
+	 * The block layer requires discard_granularity to be a power of 2.
+	 */
+	if (pf.discard_enabled && !is_power_of_2(block_size)) {
+		ti->error = "Discard support must be disabled when the block size is not a power of 2";
+		r = -EINVAL;
+		goto out_flags_changed;
+	}
+
 	pt->pool = pool;
 	pt->ti = ti;
 	pt->metadata_dev = metadata_dev;
@@ -2033,7 +2273,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		 * stacking of discard limits (this keeps the pool and
 		 * thin devices' discard limits consistent).
 		 */
-		ti->discards_supported = 1;
+		ti->discards_supported = true;
 	}
 	ti->private = pt;
 
@@ -2093,7 +2333,8 @@ static int pool_preresume(struct dm_target *ti)
 	int r;
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
-	dm_block_t data_size, sb_data_size;
+	sector_t data_size = ti->len;
+	dm_block_t sb_data_size;
 
 	/*
 	 * Take control of the pool object.
@@ -2102,7 +2343,8 @@ static int pool_preresume(struct dm_target *ti)
 	if (r)
 		return r;
 
-	data_size = ti->len >> pool->block_shift;
+	(void) sector_div(data_size, pool->sectors_per_block);
+
 	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
 	if (r) {
 		DMERR("failed to retrieve data device size");
@@ -2111,22 +2353,19 @@ static int pool_preresume(struct dm_target *ti)
 
 	if (data_size < sb_data_size) {
 		DMERR("pool target too small, is %llu blocks (expected %llu)",
-		      data_size, sb_data_size);
+		      (unsigned long long)data_size, sb_data_size);
 		return -EINVAL;
 
 	} else if (data_size > sb_data_size) {
 		r = dm_pool_resize_data_dev(pool->pmd, data_size);
 		if (r) {
 			DMERR("failed to resize data device");
+			/* FIXME Stricter than necessary: Rollback transaction instead here */
+			set_pool_mode(pool, PM_READ_ONLY);
 			return r;
 		}
 
-		r = dm_pool_commit_metadata(pool->pmd);
-		if (r) {
-			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-			      __func__, r);
-			return r;
-		}
+		(void) commit_or_fallback(pool);
 	}
 
 	return 0;
@@ -2149,19 +2388,12 @@ static void pool_resume(struct dm_target *ti)
 
 static void pool_postsuspend(struct dm_target *ti)
 {
-	int r;
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
 
 	cancel_delayed_work(&pool->waker);
 	flush_workqueue(pool->wq);
-
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r < 0) {
-		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-		      __func__, r);
-		/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
-	}
+	(void) commit_or_fallback(pool);
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2295,12 +2527,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
 	if (r)
 		return r;
 
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r) {
-		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-		      __func__, r);
-		return r;
-	}
+	(void) commit_or_fallback(pool);
 
 	r = dm_pool_reserve_metadata_snap(pool->pmd);
 	if (r)
@@ -2361,25 +2588,41 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 	else
 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
 
-	if (!r) {
-		r = dm_pool_commit_metadata(pool->pmd);
-		if (r)
-			DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
-			      argv[0], r);
-	}
+	if (!r)
+		(void) commit_or_fallback(pool);
 
 	return r;
 }
 
+static void emit_flags(struct pool_features *pf, char *result,
+		       unsigned sz, unsigned maxlen)
+{
+	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
+		!pf->discard_passdown + (pf->mode == PM_READ_ONLY);
+	DMEMIT("%u ", count);
+
+	if (!pf->zero_new_blocks)
+		DMEMIT("skip_block_zeroing ");
+
+	if (!pf->discard_enabled)
+		DMEMIT("ignore_discard ");
+
+	if (!pf->discard_passdown)
+		DMEMIT("no_discard_passdown ");
+
+	if (pf->mode == PM_READ_ONLY)
+		DMEMIT("read_only ");
+}
+
 /*
  * Status line is:
  *    <transaction id> <used metadata sectors>/<total metadata sectors>
  *    <used data sectors>/<total data sectors> <held metadata root>
  */
 static int pool_status(struct dm_target *ti, status_type_t type,
-		       char *result, unsigned maxlen)
+		       unsigned status_flags, char *result, unsigned maxlen)
 {
-	int r, count;
+	int r;
 	unsigned sz = 0;
 	uint64_t transaction_id;
 	dm_block_t nr_free_blocks_data;
@@ -2394,6 +2637,15 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 
 	switch (type) {
 	case STATUSTYPE_INFO:
+		if (get_pool_mode(pool) == PM_FAIL) {
+			DMEMIT("Fail");
+			break;
+		}
+
+		/* Commit to ensure statistics aren't out-of-date */
+		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+			(void) commit_or_fallback(pool);
+
 		r = dm_pool_get_metadata_transaction_id(pool->pmd,
 							&transaction_id);
 		if (r)
@@ -2429,9 +2681,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 		       (unsigned long long)nr_blocks_data);
 
 		if (held_root)
-			DMEMIT("%llu", held_root);
+			DMEMIT("%llu ", held_root);
+		else
+			DMEMIT("- ");
+
+		if (pool->pf.mode == PM_READ_ONLY)
+			DMEMIT("ro ");
+		else
+			DMEMIT("rw ");
+
+		if (pool->pf.discard_enabled && pool->pf.discard_passdown)
+			DMEMIT("discard_passdown");
 		else
-			DMEMIT("-");
+			DMEMIT("no_discard_passdown");
 
 		break;
 
@@ -2441,20 +2703,7 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
 		       (unsigned long)pool->sectors_per_block,
 		       (unsigned long long)pt->low_water_blocks);
-
-		count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
-			!pt->pf.discard_passdown;
-		DMEMIT("%u ", count);
-
-		if (!pool->pf.zero_new_blocks)
-			DMEMIT("skip_block_zeroing ");
-
-		if (!pool->pf.discard_enabled)
-			DMEMIT("ignore_discard ");
-
-		if (!pt->pf.discard_passdown)
-			DMEMIT("no_discard_passdown ");
-
+		emit_flags(&pt->pf, result, sz, maxlen);
 		break;
 	}
 
@@ -2492,7 +2741,8 @@ static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
 
 	/*
 	 * This is just a hint, and not enforced.  We have to cope with
-	 * bios that overlap 2 blocks.
+	 * bios that cover a block partially.  A discard that spans a block
+	 * boundary is not sent to this target.
 	 */
 	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
 	limits->discard_zeroes_data = pool->pf.zero_new_blocks;
@@ -2513,7 +2763,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -2618,20 +2868,31 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 	__pool_inc(tc->pool);
 
+	if (get_pool_mode(tc->pool) == PM_FAIL) {
+		ti->error = "Couldn't open thin device, Pool is in fail mode";
+		goto bad_thin_open;
+	}
+
 	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
 	if (r) {
 		ti->error = "Couldn't open thin internal device";
 		goto bad_thin_open;
 	}
 
-	ti->split_io = tc->pool->sectors_per_block;
+	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
+	if (r)
+		goto bad_thin_open;
+
 	ti->num_flush_requests = 1;
+	ti->flush_supported = true;
 
 	/* In case the pool supports discards, pass them on. */
 	if (tc->pool->pf.discard_enabled) {
-		ti->discards_supported = 1;
+		ti->discards_supported = true;
 		ti->num_discard_requests = 1;
-		ti->discard_zeroes_data_unsupported = 1;
+		ti->discard_zeroes_data_unsupported = true;
+		/* Discard requests must be split on a block boundary */
+		ti->split_discard_requests = true;
 	}
 
 	dm_put(pool_md);
@@ -2712,7 +2973,7 @@ static void thin_postsuspend(struct dm_target *ti)
  * <nr mapped sectors> <highest mapped sector>
  */
 static int thin_status(struct dm_target *ti, status_type_t type,
-		       char *result, unsigned maxlen)
+		       unsigned status_flags, char *result, unsigned maxlen)
 {
 	int r;
 	ssize_t sz = 0;
@@ -2720,6 +2981,11 @@ static int thin_status(struct dm_target *ti, status_type_t type,
 	char buf[BDEVNAME_SIZE];
 	struct thin_c *tc = ti->private;
 
+	if (get_pool_mode(tc->pool) == PM_FAIL) {
+		DMEMIT("Fail");
+		return 0;
+	}
+
 	if (!tc->td)
 		DMEMIT("-");
 	else {
@@ -2757,19 +3023,21 @@ static int thin_status(struct dm_target *ti, status_type_t type,
 static int thin_iterate_devices(struct dm_target *ti,
 				iterate_devices_callout_fn fn, void *data)
 {
-	dm_block_t blocks;
+	sector_t blocks;
 	struct thin_c *tc = ti->private;
+	struct pool *pool = tc->pool;
 
 	/*
 	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
 	 * we follow a more convoluted path through to the pool's target.
 	 */
-	if (!tc->pool->ti)
+	if (!pool->ti)
 		return 0;	/* nothing is bound */
 
-	blocks = tc->pool->ti->len >> tc->pool->block_shift;
+	blocks = pool->ti->len;
+	(void) sector_div(blocks, pool->sectors_per_block);
 	if (blocks)
-		return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
+		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
 
 	return 0;
 }
@@ -2786,7 +3054,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 1, 0},
+	.version = {1, 3, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,

+ 1 - 1
drivers/md/dm-verity.c

@@ -515,7 +515,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio,
  * Status: V (valid) or C (corruption found)
  */
 static int verity_status(struct dm_target *ti, status_type_t type,
-			 char *result, unsigned maxlen)
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct dm_verity *v = ti->private;
 	unsigned sz = 0;

+ 31 - 9
drivers/md/dm.c

@@ -968,22 +968,41 @@ static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti
 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
 {
 	sector_t len = max_io_len_target_boundary(sector, ti);
+	sector_t offset, max_len;
 
 	/*
-	 * Does the target need to split even further ?
+	 * Does the target need to split even further?
 	 */
-	if (ti->split_io) {
-		sector_t boundary;
-		sector_t offset = dm_target_offset(ti, sector);
-		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
-			   - offset;
-		if (len > boundary)
-			len = boundary;
+	if (ti->max_io_len) {
+		offset = dm_target_offset(ti, sector);
+		if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
+			max_len = sector_div(offset, ti->max_io_len);
+		else
+			max_len = offset & (ti->max_io_len - 1);
+		max_len = ti->max_io_len - max_len;
+
+		if (len > max_len)
+			len = max_len;
 	}
 
 	return len;
 }
 
+int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
+{
+	if (len > UINT_MAX) {
+		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
+		      (unsigned long long)len, UINT_MAX);
+		ti->error = "Maximum size of target IO is too large";
+		return -EINVAL;
+	}
+
+	ti->max_io_len = (uint32_t) len;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
+
 static void __map_bio(struct dm_target *ti, struct bio *clone,
 		      struct dm_target_io *tio)
 {
@@ -1196,7 +1215,10 @@ static int __clone_and_map_discard(struct clone_info *ci)
 		if (!ti->num_discard_requests)
 			return -EOPNOTSUPP;
 
-		len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+		if (!ti->split_discard_requests)
+			len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+		else
+			len = min(ci->sector_count, max_io_len(ci->sector, ti));
 
 		__issue_target_requests(ci, ti, ti->num_discard_requests, len);
 

+ 5 - 0
drivers/md/dm.h

@@ -22,6 +22,11 @@
 #define DM_SUSPEND_LOCKFS_FLAG		(1 << 0)
 #define DM_SUSPEND_NOFLUSH_FLAG		(1 << 1)
 
+/*
+ * Status feature flags
+ */
+#define DM_STATUS_NOFLUSH_FLAG		(1 << 0)
+
 /*
  * Type of table and mapped_device's mempool
  */

+ 0 - 1
drivers/md/persistent-data/Makefile

@@ -1,7 +1,6 @@
 obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
 dm-persistent-data-objs := \
 	dm-block-manager.o \
-	dm-space-map-checker.o \
 	dm-space-map-common.o \
 	dm-space-map-disk.o \
 	dm-space-map-metadata.o \

+ 59 - 46
drivers/md/persistent-data/dm-block-manager.c

@@ -325,11 +325,6 @@ static struct dm_buffer *to_buffer(struct dm_block *b)
 	return (struct dm_buffer *) b;
 }
 
-static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm)
-{
-	return (struct dm_bufio_client *) bm;
-}
-
 dm_block_t dm_block_location(struct dm_block *b)
 {
 	return dm_bufio_get_block_number(to_buffer(b));
@@ -367,34 +362,60 @@ static void dm_block_manager_write_callback(struct dm_buffer *buf)
 /*----------------------------------------------------------------
  * Public interface
  *--------------------------------------------------------------*/
+struct dm_block_manager {
+	struct dm_bufio_client *bufio;
+	bool read_only:1;
+};
+
 struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
 						 unsigned block_size,
 						 unsigned cache_size,
 						 unsigned max_held_per_thread)
 {
-	return (struct dm_block_manager *)
-		dm_bufio_client_create(bdev, block_size, max_held_per_thread,
-				       sizeof(struct buffer_aux),
-				       dm_block_manager_alloc_callback,
-				       dm_block_manager_write_callback);
+	int r;
+	struct dm_block_manager *bm;
+
+	bm = kmalloc(sizeof(*bm), GFP_KERNEL);
+	if (!bm) {
+		r = -ENOMEM;
+		goto bad;
+	}
+
+	bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
+					   sizeof(struct buffer_aux),
+					   dm_block_manager_alloc_callback,
+					   dm_block_manager_write_callback);
+	if (IS_ERR(bm->bufio)) {
+		r = PTR_ERR(bm->bufio);
+		kfree(bm);
+		goto bad;
+	}
+
+	bm->read_only = false;
+
+	return bm;
+
+bad:
+	return ERR_PTR(r);
 }
 EXPORT_SYMBOL_GPL(dm_block_manager_create);
 
 void dm_block_manager_destroy(struct dm_block_manager *bm)
 {
-	return dm_bufio_client_destroy(to_bufio(bm));
+	dm_bufio_client_destroy(bm->bufio);
+	kfree(bm);
 }
 EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
 
 unsigned dm_bm_block_size(struct dm_block_manager *bm)
 {
-	return dm_bufio_get_block_size(to_bufio(bm));
+	return dm_bufio_get_block_size(bm->bufio);
 }
 EXPORT_SYMBOL_GPL(dm_bm_block_size);
 
 dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
 {
-	return dm_bufio_get_device_size(to_bufio(bm));
+	return dm_bufio_get_device_size(bm->bufio);
 }
 
 static int dm_bm_validate_buffer(struct dm_block_manager *bm,
@@ -406,7 +427,7 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm,
 		int r;
 		if (!v)
 			return 0;
-		r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm)));
+		r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
 		if (unlikely(r))
 			return r;
 		aux->validator = v;
@@ -430,7 +451,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
 	void *p;
 	int r;
 
-	p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
 	if (unlikely(IS_ERR(p)))
 		return PTR_ERR(p);
 
@@ -463,7 +484,10 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
 	void *p;
 	int r;
 
-	p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (bm->read_only)
+		return -EPERM;
+
+	p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
 	if (unlikely(IS_ERR(p)))
 		return PTR_ERR(p);
 
@@ -496,7 +520,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
 	void *p;
 	int r;
 
-	p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result);
+	p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
 	if (unlikely(IS_ERR(p)))
 		return PTR_ERR(p);
 	if (unlikely(!p))
@@ -529,7 +553,10 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
 	struct buffer_aux *aux;
 	void *p;
 
-	p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (bm->read_only)
+		return -EPERM;
+
+	p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
 	if (unlikely(IS_ERR(p)))
 		return PTR_ERR(p);
 
@@ -547,6 +574,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
 
 int dm_bm_unlock(struct dm_block *b)
 {
@@ -565,45 +593,30 @@ int dm_bm_unlock(struct dm_block *b)
 }
 EXPORT_SYMBOL_GPL(dm_bm_unlock);
 
-int dm_bm_unlock_move(struct dm_block *b, dm_block_t n)
-{
-	struct buffer_aux *aux;
-
-	aux = dm_bufio_get_aux_data(to_buffer(b));
-
-	if (aux->write_locked) {
-		dm_bufio_mark_buffer_dirty(to_buffer(b));
-		bl_up_write(&aux->lock);
-	} else
-		bl_up_read(&aux->lock);
-
-	dm_bufio_release_move(to_buffer(b), n);
-	return 0;
-}
-
 int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 			   struct dm_block *superblock)
 {
 	int r;
 
-	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
-	if (unlikely(r))
-		return r;
-	r = dm_bufio_issue_flush(to_bufio(bm));
-	if (unlikely(r))
+	if (bm->read_only)
+		return -EPERM;
+
+	r = dm_bufio_write_dirty_buffers(bm->bufio);
+	if (unlikely(r)) {
+		dm_bm_unlock(superblock);
 		return r;
+	}
 
 	dm_bm_unlock(superblock);
 
-	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
-	if (unlikely(r))
-		return r;
-	r = dm_bufio_issue_flush(to_bufio(bm));
-	if (unlikely(r))
-		return r;
+	return dm_bufio_write_dirty_buffers(bm->bufio);
+}
 
-	return 0;
+void dm_bm_set_read_only(struct dm_block_manager *bm)
+{
+	bm->read_only = true;
 }
+EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
 
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
 {

+ 13 - 8
drivers/md/persistent-data/dm-block-manager.h

@@ -96,14 +96,6 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
 
 int dm_bm_unlock(struct dm_block *b);
 
-/*
- * An optimisation; we often want to copy a block's contents to a new
- * block.  eg, as part of the shadowing operation.  It's far better for
- * bufio to do this move behind the scenes than hold 2 locks and memcpy the
- * data.
- */
-int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
-
 /*
  * It's a common idiom to have a superblock that should be committed last.
  *
@@ -116,6 +108,19 @@ int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
 int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 			   struct dm_block *superblock);
 
+/*
+ * Switches the bm to a read only mode.  Once read-only mode
+ * has been entered the following functions will return -EPERM.
+ *
+ *   dm_bm_write_lock
+ *   dm_bm_write_lock_zero
+ *   dm_bm_flush_and_unlock
+ *
+ * Additionally you should not use dm_bm_unlock_move, however no error will
+ * be returned if you do.
+ */
+void dm_bm_set_read_only(struct dm_block_manager *bm);
+
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
 
 /*----------------------------------------------------------------*/

+ 0 - 446
drivers/md/persistent-data/dm-space-map-checker.c

@@ -1,446 +0,0 @@
-/*
- * Copyright (C) 2011 Red Hat, Inc.
- *
- * This file is released under the GPL.
- */
-
-#include "dm-space-map-checker.h"
-
-#include <linux/device-mapper.h>
-#include <linux/export.h>
-#include <linux/vmalloc.h>
-
-#ifdef CONFIG_DM_DEBUG_SPACE_MAPS
-
-#define DM_MSG_PREFIX "space map checker"
-
-/*----------------------------------------------------------------*/
-
-struct count_array {
-	dm_block_t nr;
-	dm_block_t nr_free;
-
-	uint32_t *counts;
-};
-
-static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count)
-{
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	*count = ca->counts[b];
-	return 0;
-}
-
-static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r)
-{
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	*r = ca->counts[b] > 1;
-	return 0;
-}
-
-static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count)
-{
-	uint32_t old_count;
-
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	old_count = ca->counts[b];
-
-	if (!count && old_count)
-		ca->nr_free++;
-
-	else if (count && !old_count)
-		ca->nr_free--;
-
-	ca->counts[b] = count;
-	return 0;
-}
-
-static int ca_inc_block(struct count_array *ca, dm_block_t b)
-{
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	ca_set_count(ca, b, ca->counts[b] + 1);
-	return 0;
-}
-
-static int ca_dec_block(struct count_array *ca, dm_block_t b)
-{
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	BUG_ON(ca->counts[b] == 0);
-	ca_set_count(ca, b, ca->counts[b] - 1);
-	return 0;
-}
-
-static int ca_create(struct count_array *ca, struct dm_space_map *sm)
-{
-	int r;
-	dm_block_t nr_blocks;
-
-	r = dm_sm_get_nr_blocks(sm, &nr_blocks);
-	if (r)
-		return r;
-
-	ca->nr = nr_blocks;
-	ca->nr_free = nr_blocks;
-
-	if (!nr_blocks)
-		ca->counts = NULL;
-	else {
-		ca->counts = vzalloc(sizeof(*ca->counts) * nr_blocks);
-		if (!ca->counts)
-			return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static void ca_destroy(struct count_array *ca)
-{
-	vfree(ca->counts);
-}
-
-static int ca_load(struct count_array *ca, struct dm_space_map *sm)
-{
-	int r;
-	uint32_t count;
-	dm_block_t nr_blocks, i;
-
-	r = dm_sm_get_nr_blocks(sm, &nr_blocks);
-	if (r)
-		return r;
-
-	BUG_ON(ca->nr != nr_blocks);
-
-	DMWARN("Loading debug space map from disk.  This may take some time");
-	for (i = 0; i < nr_blocks; i++) {
-		r = dm_sm_get_count(sm, i, &count);
-		if (r) {
-			DMERR("load failed");
-			return r;
-		}
-
-		ca_set_count(ca, i, count);
-	}
-	DMWARN("Load complete");
-
-	return 0;
-}
-
-static int ca_extend(struct count_array *ca, dm_block_t extra_blocks)
-{
-	dm_block_t nr_blocks = ca->nr + extra_blocks;
-	uint32_t *counts = vzalloc(sizeof(*counts) * nr_blocks);
-	if (!counts)
-		return -ENOMEM;
-
-	if (ca->counts) {
-		memcpy(counts, ca->counts, sizeof(*counts) * ca->nr);
-		ca_destroy(ca);
-	}
-	ca->nr = nr_blocks;
-	ca->nr_free += extra_blocks;
-	ca->counts = counts;
-	return 0;
-}
-
-static int ca_commit(struct count_array *old, struct count_array *new)
-{
-	if (old->nr != new->nr) {
-		BUG_ON(old->nr > new->nr);
-		ca_extend(old, new->nr - old->nr);
-	}
-
-	BUG_ON(old->nr != new->nr);
-	old->nr_free = new->nr_free;
-	memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr);
-	return 0;
-}
-
-/*----------------------------------------------------------------*/
-
-struct sm_checker {
-	struct dm_space_map sm;
-
-	struct count_array old_counts;
-	struct count_array counts;
-
-	struct dm_space_map *real_sm;
-};
-
-static void sm_checker_destroy(struct dm_space_map *sm)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-
-	dm_sm_destroy(smc->real_sm);
-	ca_destroy(&smc->old_counts);
-	ca_destroy(&smc->counts);
-	kfree(smc);
-}
-
-static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_get_nr_blocks(smc->real_sm, count);
-	if (!r)
-		BUG_ON(smc->old_counts.nr != *count);
-	return r;
-}
-
-static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_get_nr_free(smc->real_sm, count);
-	if (!r) {
-		/*
-		 * Slow, but we know it's correct.
-		 */
-		dm_block_t b, n = 0;
-		for (b = 0; b < smc->old_counts.nr; b++)
-			if (smc->old_counts.counts[b] == 0 &&
-			    smc->counts.counts[b] == 0)
-				n++;
-
-		if (n != *count)
-			DMERR("free block counts differ, checker %u, sm-disk:%u",
-			      (unsigned) n, (unsigned) *count);
-	}
-	return r;
-}
-
-static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_new_block(smc->real_sm, b);
-
-	if (!r) {
-		BUG_ON(*b >= smc->old_counts.nr);
-		BUG_ON(smc->old_counts.counts[*b] != 0);
-		BUG_ON(*b >= smc->counts.nr);
-		BUG_ON(smc->counts.counts[*b] != 0);
-		ca_set_count(&smc->counts, *b, 1);
-	}
-
-	return r;
-}
-
-static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_inc_block(smc->real_sm, b);
-	int r2 = ca_inc_block(&smc->counts, b);
-	BUG_ON(r != r2);
-	return r;
-}
-
-static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_dec_block(smc->real_sm, b);
-	int r2 = ca_dec_block(&smc->counts, b);
-	BUG_ON(r != r2);
-	return r;
-}
-
-static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	uint32_t result2 = 0;
-	int r = dm_sm_get_count(smc->real_sm, b, result);
-	int r2 = ca_get_count(&smc->counts, b, &result2);
-
-	BUG_ON(r != r2);
-	if (!r)
-		BUG_ON(*result != result2);
-	return r;
-}
-
-static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int result2 = 0;
-	int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result);
-	int r2 = ca_count_more_than_one(&smc->counts, b, &result2);
-
-	BUG_ON(r != r2);
-	if (!r)
-		BUG_ON(!(*result) && result2);
-	return r;
-}
-
-static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	uint32_t old_rc;
-	int r = dm_sm_set_count(smc->real_sm, b, count);
-	int r2;
-
-	BUG_ON(b >= smc->counts.nr);
-	old_rc = smc->counts.counts[b];
-	r2 = ca_set_count(&smc->counts, b, count);
-	BUG_ON(r != r2);
-
-	return r;
-}
-
-static int sm_checker_commit(struct dm_space_map *sm)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r;
-
-	r = dm_sm_commit(smc->real_sm);
-	if (r)
-		return r;
-
-	r = ca_commit(&smc->old_counts, &smc->counts);
-	if (r)
-		return r;
-
-	return 0;
-}
-
-static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_extend(smc->real_sm, extra_blocks);
-	if (r)
-		return r;
-
-	return ca_extend(&smc->counts, extra_blocks);
-}
-
-static int sm_checker_root_size(struct dm_space_map *sm, size_t *result)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	return dm_sm_root_size(smc->real_sm, result);
-}
-
-static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len);
-}
-
-/*----------------------------------------------------------------*/
-
-static struct dm_space_map ops_ = {
-	.destroy = sm_checker_destroy,
-	.get_nr_blocks = sm_checker_get_nr_blocks,
-	.get_nr_free = sm_checker_get_nr_free,
-	.inc_block = sm_checker_inc_block,
-	.dec_block = sm_checker_dec_block,
-	.new_block = sm_checker_new_block,
-	.get_count = sm_checker_get_count,
-	.count_is_more_than_one = sm_checker_count_more_than_one,
-	.set_count = sm_checker_set_count,
-	.commit = sm_checker_commit,
-	.extend = sm_checker_extend,
-	.root_size = sm_checker_root_size,
-	.copy_root = sm_checker_copy_root
-};
-
-struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
-{
-	int r;
-	struct sm_checker *smc;
-
-	if (IS_ERR_OR_NULL(sm))
-		return ERR_PTR(-EINVAL);
-
-	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
-	if (!smc)
-		return ERR_PTR(-ENOMEM);
-
-	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
-	r = ca_create(&smc->old_counts, sm);
-	if (r) {
-		kfree(smc);
-		return ERR_PTR(r);
-	}
-
-	r = ca_create(&smc->counts, sm);
-	if (r) {
-		ca_destroy(&smc->old_counts);
-		kfree(smc);
-		return ERR_PTR(r);
-	}
-
-	smc->real_sm = sm;
-
-	r = ca_load(&smc->counts, sm);
-	if (r) {
-		ca_destroy(&smc->counts);
-		ca_destroy(&smc->old_counts);
-		kfree(smc);
-		return ERR_PTR(r);
-	}
-
-	r = ca_commit(&smc->old_counts, &smc->counts);
-	if (r) {
-		ca_destroy(&smc->counts);
-		ca_destroy(&smc->old_counts);
-		kfree(smc);
-		return ERR_PTR(r);
-	}
-
-	return &smc->sm;
-}
-EXPORT_SYMBOL_GPL(dm_sm_checker_create);
-
-struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
-{
-	int r;
-	struct sm_checker *smc;
-
-	if (IS_ERR_OR_NULL(sm))
-		return ERR_PTR(-EINVAL);
-
-	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
-	if (!smc)
-		return ERR_PTR(-ENOMEM);
-
-	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
-	r = ca_create(&smc->old_counts, sm);
-	if (r) {
-		kfree(smc);
-		return ERR_PTR(r);
-	}
-
-	r = ca_create(&smc->counts, sm);
-	if (r) {
-		ca_destroy(&smc->old_counts);
-		kfree(smc);
-		return ERR_PTR(r);
-	}
-
-	smc->real_sm = sm;
-	return &smc->sm;
-}
-EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
-
-/*----------------------------------------------------------------*/
-
-#else
-
-struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
-{
-	return sm;
-}
-EXPORT_SYMBOL_GPL(dm_sm_checker_create);
-
-struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
-{
-	return sm;
-}
-EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
-
-/*----------------------------------------------------------------*/
-
-#endif

+ 0 - 26
drivers/md/persistent-data/dm-space-map-checker.h

@@ -1,26 +0,0 @@
-/*
- * Copyright (C) 2011 Red Hat, Inc.
- *
- * This file is released under the GPL.
- */
-
-#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H
-#define SNAPSHOTS_SPACE_MAP_CHECKER_H
-
-#include "dm-space-map.h"
-
-/*----------------------------------------------------------------*/
-
-/*
- * This space map wraps a real on-disk space map, and verifies all of its
- * operations.  It uses a lot of memory, so only use if you have a specific
- * problem that you're debugging.
- *
- * Ownership of @sm passes.
- */
-struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm);
-struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm);
-
-/*----------------------------------------------------------------*/
-
-#endif

+ 11 - 1
drivers/md/persistent-data/dm-space-map-common.c

@@ -224,6 +224,7 @@ static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
 	ll->nr_blocks = 0;
 	ll->bitmap_root = 0;
 	ll->ref_count_root = 0;
+	ll->bitmap_index_changed = false;
 
 	return 0;
 }
@@ -476,7 +477,15 @@ int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
 
 int sm_ll_commit(struct ll_disk *ll)
 {
-	return ll->commit(ll);
+	int r = 0;
+
+	if (ll->bitmap_index_changed) {
+		r = ll->commit(ll);
+		if (!r)
+			ll->bitmap_index_changed = false;
+	}
+
+	return r;
 }
 
 /*----------------------------------------------------------------*/
@@ -491,6 +500,7 @@ static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
 static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
 			       struct disk_index_entry *ie)
 {
+	ll->bitmap_index_changed = true;
 	memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
 	return 0;
 }

+ 1 - 0
drivers/md/persistent-data/dm-space-map-common.h

@@ -78,6 +78,7 @@ struct ll_disk {
 	open_index_fn open_index;
 	max_index_entries_fn max_entries;
 	commit_fn commit;
+	bool bitmap_index_changed:1;
 };
 
 struct disk_sm_root {

+ 4 - 30
drivers/md/persistent-data/dm-space-map-disk.c

@@ -4,7 +4,6 @@
  * This file is released under the GPL.
  */
 
-#include "dm-space-map-checker.h"
 #include "dm-space-map-common.h"
 #include "dm-space-map-disk.h"
 #include "dm-space-map.h"
@@ -252,9 +251,8 @@ static struct dm_space_map ops = {
 	.copy_root = sm_disk_copy_root
 };
 
-static struct dm_space_map *dm_sm_disk_create_real(
-	struct dm_transaction_manager *tm,
-	dm_block_t nr_blocks)
+struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
+				       dm_block_t nr_blocks)
 {
 	int r;
 	struct sm_disk *smd;
@@ -285,27 +283,10 @@ bad:
 	kfree(smd);
 	return ERR_PTR(r);
 }
-
-struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
-				       dm_block_t nr_blocks)
-{
-	struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks);
-	struct dm_space_map *smc;
-
-	if (IS_ERR_OR_NULL(sm))
-		return sm;
-
-	smc = dm_sm_checker_create_fresh(sm);
-	if (IS_ERR(smc))
-		dm_sm_destroy(sm);
-
-	return smc;
-}
 EXPORT_SYMBOL_GPL(dm_sm_disk_create);
 EXPORT_SYMBOL_GPL(dm_sm_disk_create);
 
 
-static struct dm_space_map *dm_sm_disk_open_real(
-	struct dm_transaction_manager *tm,
-	void *root_le, size_t len)
+struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
+				     void *root_le, size_t len)
 {
 {
 	int r;
 	int r;
 	struct sm_disk *smd;
 	struct sm_disk *smd;
@@ -332,13 +313,6 @@ bad:
 	kfree(smd);
 	kfree(smd);
 	return ERR_PTR(r);
 	return ERR_PTR(r);
 }
 }
-
-struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
-				     void *root_le, size_t len)
-{
-	return dm_sm_checker_create(
-		dm_sm_disk_open_real(tm, root_le, len));
-}
 EXPORT_SYMBOL_GPL(dm_sm_disk_open);
 EXPORT_SYMBOL_GPL(dm_sm_disk_open);
 
 
 /*----------------------------------------------------------------*/
 /*----------------------------------------------------------------*/

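With the debug checker wrapper gone, dm_sm_disk_create() and dm_sm_disk_open() hand back the real space map directly and signal failure through ERR_PTR(), so callers no longer have a wrapper object to unwind on error. A hedged sketch of the resulting caller-side error handling (the surrounding transaction-manager setup is assumed, not shown in this diff):

    struct dm_space_map *sm;

    sm = dm_sm_disk_create(tm, nr_blocks);
    if (IS_ERR(sm))
    	return PTR_ERR(sm);	/* single allocation to report, no checker to destroy */

    /* ... use sm; on teardown: */
    dm_sm_destroy(sm);
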
+ 32 - 59
drivers/md/persistent-data/dm-transaction-manager.c

@@ -5,7 +5,6 @@
  */
 #include "dm-transaction-manager.h"
 #include "dm-space-map.h"
-#include "dm-space-map-checker.h"
 #include "dm-space-map-disk.h"
 #include "dm-space-map-metadata.h"
 #include "dm-persistent-data-internal.h"
@@ -220,13 +219,24 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 	if (r < 0)
 		return r;
 
-	r = dm_bm_unlock_move(orig_block, new);
-	if (r < 0) {
+	/*
+	 * It would be tempting to use dm_bm_unlock_move here, but some
+	 * code, such as the space maps, keeps using the old data structures
+	 * secure in the knowledge they won't be changed until the next
+	 * transaction.  Using unlock_move would force a synchronous read
+	 * since the old block would no longer be in the cache.
+	 */
+	r = dm_bm_write_lock_zero(tm->bm, new, v, result);
+	if (r) {
 		dm_bm_unlock(orig_block);
 		return r;
 	}
 
-	return dm_bm_write_lock(tm->bm, new, v, result);
+	memcpy(dm_block_data(*result), dm_block_data(orig_block),
+	       dm_bm_block_size(tm->bm));
+
+	dm_bm_unlock(orig_block);
+	return r;
 }
 
 int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
@@ -311,98 +321,61 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
 
 static int dm_tm_create_internal(struct dm_block_manager *bm,
 				 dm_block_t sb_location,
-				 struct dm_block_validator *sb_validator,
-				 size_t root_offset, size_t root_max_len,
 				 struct dm_transaction_manager **tm,
 				 struct dm_space_map **sm,
-				 struct dm_block **sblock,
-				 int create)
+				 int create,
+				 void *sm_root, size_t sm_len)
 {
 	int r;
-	struct dm_space_map *inner;
 
-	inner = dm_sm_metadata_init();
-	if (IS_ERR(inner))
-		return PTR_ERR(inner);
+	*sm = dm_sm_metadata_init();
+	if (IS_ERR(*sm))
+		return PTR_ERR(*sm);
 
-	*tm = dm_tm_create(bm, inner);
+	*tm = dm_tm_create(bm, *sm);
 	if (IS_ERR(*tm)) {
-		dm_sm_destroy(inner);
+		dm_sm_destroy(*sm);
 		return PTR_ERR(*tm);
 	}
 
 	if (create) {
-		r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location,
-					  sb_validator, sblock);
-		if (r < 0) {
-			DMERR("couldn't lock superblock");
-			goto bad1;
-		}
-
-		r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm),
+		r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm),
 					  sb_location);
 		if (r) {
 			DMERR("couldn't create metadata space map");
-			goto bad2;
-		}
-
-		*sm = dm_sm_checker_create(inner);
-		if (IS_ERR(*sm)) {
-			r = PTR_ERR(*sm);
-			goto bad2;
+			goto bad;
 		}
 
 	} else {
-		r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location,
-				     sb_validator, sblock);
-		if (r < 0) {
-			DMERR("couldn't lock superblock");
-			goto bad1;
-		}
-
-		r = dm_sm_metadata_open(inner, *tm,
-					dm_block_data(*sblock) + root_offset,
-					root_max_len);
+		r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len);
 		if (r) {
 			DMERR("couldn't open metadata space map");
-			goto bad2;
-		}
-
-		*sm = dm_sm_checker_create(inner);
-		if (IS_ERR(*sm)) {
-			r = PTR_ERR(*sm);
-			goto bad2;
+			goto bad;
 		}
 	}
 
 	return 0;
 
-bad2:
-	dm_tm_unlock(*tm, *sblock);
-bad1:
+bad:
 	dm_tm_destroy(*tm);
-	dm_sm_destroy(inner);
+	dm_sm_destroy(*sm);
 	return r;
 }
 
 int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
-			 struct dm_block_validator *sb_validator,
 			 struct dm_transaction_manager **tm,
-			 struct dm_space_map **sm, struct dm_block **sblock)
+			 struct dm_space_map **sm)
 {
-	return dm_tm_create_internal(bm, sb_location, sb_validator,
-				     0, 0, tm, sm, sblock, 1);
+	return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0);
 }
 EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
 
 int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
-		       struct dm_block_validator *sb_validator,
-		       size_t root_offset, size_t root_max_len,
+		       void *sm_root, size_t root_len,
 		       struct dm_transaction_manager **tm,
-		       struct dm_space_map **sm, struct dm_block **sblock)
+		       struct dm_space_map **sm)
 {
-	return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset,
-				     root_max_len, tm, sm, sblock, 0);
+	return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len);
 }
 EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
 

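__shadow_block() backs dm_tm_shadow_block(), the copy-on-write primitive the persistent-data structures build on: before modifying a block within the current transaction, callers request a shadow and write through that, leaving the original block intact (and, after this patch, still warm in the cache) for readers of the previous transaction. A hedged sketch of typical caller usage — tm, orig and the validator come from the caller's context, and modify_node() is a hypothetical mutator:

    struct dm_block *b;
    int inc;	/* set if the block's children need their refcounts bumped */
    int r;

    r = dm_tm_shadow_block(tm, orig, &validator, &b, &inc);
    if (r)
    	return r;

    /* The shadow is writeable for the rest of this transaction. */
    modify_node(dm_block_data(b));	/* hypothetical mutator */

    dm_tm_unlock(tm, b);
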
+ 6 - 5
drivers/md/persistent-data/dm-transaction-manager.h

@@ -115,16 +115,17 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
  *
  * Returns a tm that has an open transaction to write the new disk sm.
  * Caller should store the new sm root and commit.
+ *
+ * The superblock location is passed so the metadata space map knows it
+ * shouldn't be used.
  */
 int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
-			 struct dm_block_validator *sb_validator,
 			 struct dm_transaction_manager **tm,
-			 struct dm_space_map **sm, struct dm_block **sblock);
+			 struct dm_space_map **sm);
 
 int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
-		       struct dm_block_validator *sb_validator,
-		       size_t root_offset, size_t root_max_len,
+		       void *sm_root, size_t root_len,
 		       struct dm_transaction_manager **tm,
-		       struct dm_space_map **sm, struct dm_block **sblock);
+		       struct dm_space_map **sm);
 
 #endif	/* _LINUX_DM_TRANSACTION_MANAGER_H */

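The header change moves superblock ownership out of the transaction manager: instead of passing a validator plus an offset into a superblock the tm locks itself, the client (such as dm-thin-metadata) reads its own superblock, extracts the stored space-map root, and hands that buffer to dm_tm_open_with_sm(). A hedged sketch of an open path under that split — the superblock layout, the 128-byte root size, and read_superblock_root() are assumptions for illustration, not part of this interface:

    static int open_metadata(struct dm_block_manager *bm, dm_block_t sb_location,
    			 struct dm_transaction_manager **tm,
    			 struct dm_space_map **sm)
    {
    	u8 sm_root[128];	/* copied out of the client's own superblock */
    	size_t root_len = sizeof(sm_root);
    	int r;

    	/* Hypothetical helper: lock the superblock, copy the sm root out. */
    	r = read_superblock_root(bm, sb_location, sm_root, &root_len);
    	if (r)
    		return r;

    	/* No sb_validator/root_offset/sblock arguments any more. */
    	return dm_tm_open_with_sm(bm, sb_location, sm_root, root_len, tm, sm);
    }
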
+ 22 - 7
include/linux/device-mapper.h

@@ -66,14 +66,13 @@ typedef int (*dm_request_endio_fn) (struct dm_target *ti,
 				    struct request *clone, int error,
 				    union map_info *map_context);
 
-typedef void (*dm_flush_fn) (struct dm_target *ti);
 typedef void (*dm_presuspend_fn) (struct dm_target *ti);
 typedef void (*dm_postsuspend_fn) (struct dm_target *ti);
 typedef int (*dm_preresume_fn) (struct dm_target *ti);
 typedef void (*dm_resume_fn) (struct dm_target *ti);
 
 typedef int (*dm_status_fn) (struct dm_target *ti, status_type_t status_type,
-			     char *result, unsigned int maxlen);
+			     unsigned status_flags, char *result, unsigned maxlen);
 
 typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv);
 
@@ -139,7 +138,6 @@ struct target_type {
 	dm_map_request_fn map_rq;
 	dm_endio_fn end_io;
 	dm_request_endio_fn rq_end_io;
-	dm_flush_fn flush;
 	dm_presuspend_fn presuspend;
 	dm_postsuspend_fn postsuspend;
 	dm_preresume_fn preresume;
@@ -188,8 +186,8 @@ struct dm_target {
 	sector_t begin;
 	sector_t len;
 
-	/* Always a power of 2 */
-	sector_t split_io;
+	/* If non-zero, maximum size of I/O submitted to a target. */
+	uint32_t max_io_len;
 
 	/*
 	 * A number of zero-length barrier requests that will be submitted
@@ -213,16 +211,28 @@ struct dm_target {
 	/* Used to provide an error string from the ctr */
 	char *error;
 
+	/*
+	 * Set if this target needs to receive flushes regardless of
+	 * whether or not its underlying devices have support.
+	 */
+	bool flush_supported:1;
+
 	/*
 	 * Set if this target needs to receive discards regardless of
 	 * whether or not its underlying devices have support.
 	 */
-	unsigned discards_supported:1;
+	bool discards_supported:1;
+
+	/*
+	 * Set if the target required discard request to be split
+	 * on max_io_len boundary.
+	 */
+	bool split_discard_requests:1;
 
 	/*
 	 * Set if this target does not return zeroes on discarded blocks.
 	 */
-	unsigned discard_zeroes_data_unsupported:1;
+	bool discard_zeroes_data_unsupported:1;
 };
 
 /* Each target can link one of these into the table */
@@ -359,6 +369,11 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback
  */
 int dm_table_complete(struct dm_table *t);
 
+/*
+ * Target may require that it is never sent I/O larger than len.
+ */
+int __must_check dm_set_target_max_io_len(struct dm_target *ti, sector_t len);
+
 /*
  * Table reference counting.
  */

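split_io is gone: a target that needs its I/O bounded now calls dm_set_target_max_io_len() from its constructor, and because the function is __must_check, the result has to be handled — the length is validated rather than silently assigned. A hedged constructor fragment showing the new idiom (the chunk_size value and target name are illustrative; a real target would parse it from argv):

    static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
    {
    	sector_t chunk_size = 128;	/* illustrative; parsed from argv in a real target */
    	int r;

    	/* Replaces the old "ti->split_io = chunk_size" assignment. */
    	r = dm_set_target_max_io_len(ti, chunk_size);
    	if (r)
    		return r;	/* e.g. length too large to represent */

    	ti->discards_supported = true;		/* bool bitfields after this patch */
    	ti->split_discard_requests = true;	/* split discards on max_io_len too */

    	return 0;
    }

Note that max_io_len is a uint32_t and need no longer be a power of two, which is what allows the snapshot and thin targets to accept arbitrary chunk sizes in this series.
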
+ 4 - 2
include/linux/dm-ioctl.h

@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	22
+#define DM_VERSION_MINOR	23
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2011-10-19)"
+#define DM_VERSION_EXTRA	"-ioctl (2012-07-25)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
@@ -307,6 +307,8 @@ enum {
 
 /*
  * Set this to suspend without flushing queued ios.
+ * Also disables flushing uncommitted changes in the thin target before
+ * generating statistics for DM_TABLE_STATUS and DM_DEV_WAIT.
 */
 #define DM_NOFLUSH_FLAG		(1 << 11) /* In */
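
This ties back to the new status_flags argument of dm_status_fn above: userspace passing DM_NOFLUSH_FLAG to DM_TABLE_STATUS or DM_DEV_WAIT asks for cheap statistics, and the thin target then reports status without first committing outstanding metadata. A hedged sketch of how a target's status hook might honour this — DM_STATUS_NOFLUSH_FLAG is assumed here to be the kernel-side name corresponding to the ioctl flag, and commit_metadata() is a hypothetical helper:

    static int example_status(struct dm_target *ti, status_type_t type,
    			  unsigned status_flags, char *result, unsigned maxlen)
    {
    	/*
    	 * Assumed mapping of userspace's DM_NOFLUSH_FLAG: when set,
    	 * skip the metadata commit and accept slightly stale counters.
    	 */
    	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG))
    		commit_metadata(ti->private);	/* hypothetical helper */

    	scnprintf(result, maxlen, "0 0");	/* placeholder status line */
    	return 0;
    }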
 #define DM_NOFLUSH_FLAG		(1 << 11) /* In */